在C#中,如何使用Regex类解析以逗号分隔的值,其中某些值可能是带引号的字符串,而引号内本身又包含逗号?
using System ;
using System.Text.RegularExpressions ;
/// <summary>
/// Demo program from the question: runs a single regular expression over a
/// sample comma-separated string and prints every match with a running index.
/// </summary>
class Example
{
    /// <summary>
    /// Prints the sample string, then each regex match on its own line as
    /// "index, space, tab, match text".
    /// </summary>
    public static void Main()
    {
        // Sample input mixing plain values with double-quoted and
        // single-quoted values that contain commas.
        string myString = "cat,dog,\"0 = OFF, 1 = ON\",lion,tiger,'R = red, G = green, B = blue',bear";
        Console.WriteLine("\nmyString is ...\n\t" + myString + "\n");

        // Alternatives, in order: text between a quote preceded by a comma and
        // a quote followed by a comma; text from the start up to the first
        // comma; text between two commas; text from a comma to the end.
        Regex regex = new Regex("(?<=,(\"|\')).*?(?=(\"|\'),)|(^.*?(?=,))|((?<=,).*?(?=,))|((?<=,).*?$)");

        int index = 0;
        for (Match current = regex.Match(myString); current.Success; current = current.NextMatch())
        {
            // Concatenating a Match uses its ToString(), i.e. the matched text.
            Console.WriteLine(index + " \t" + current);
            index++;
        }
    }
}
输出(部分)如下所示:
0 cat
1 dog
2 "0 = OFF
3 1 = ON"
4 lion
5 tiger
6 'R = red
7 G = green
8 B = blue'
9 bear
但是,所需的输出是:
0 cat
1 dog
2 0 = OFF, 1 = ON
3 lion
4 tiger
5 R = red, G = green, B = blue
6 bear
答案 0 :(得分:23)
试试这个正则表达式:
"[^"\r\n]*"|'[^'\r\n]*'|[^,\r\n]*
// Pattern alternatives, tried in this order at each position:
//   1. a double-quoted run containing no '"' and no line break,
//   2. a single-quoted run containing no '\'' and no line break,
//   3. a run of characters other than ',' and line breaks.
// A quoted match keeps its surrounding quote characters in Value, and the
// third alternative may match an empty string, so empty values can be printed.
Regex regexObj = new Regex(@"""[^""\r\n]*""|'[^'\r\n]*'|[^,\r\n]*");
foreach (Match field in regexObj.Matches(input))
{
    Console.WriteLine(field.Value);
}
输出:
注意:此正则表达式解决方案适用于您的案例,但我建议您使用FileHelpers等专业库。
答案 1 :(得分:21)
为什么不听从专家的建议:不要自己编写CSV解析器(Don't roll your own CSV parser)?
你的第一个想法是,“我需要在引号内处理逗号。”
你的下一个想法是,“哦,废话,我需要处理引号内的引号。转义引号。双引号。单引号......”
这是通向疯狂的道路。不要自己写。找一个单元测试覆盖全面的库,它已经处理了所有难点,并替你经历了这些痛苦。对于.NET,请使用免费且开源的FileHelpers库。
答案 2 :(得分:7)
这不是正则表达式方案,但我使用了Microsoft.VisualBasic.FileIO.TextFieldParser来处理CSV文件。是的,在C#应用程序中添加对Microsoft.VisualBasic的引用可能有点奇怪,甚至有点不够"干净",但它确实管用。
答案 3 :(得分:7)
啊,RegEx。现在你有两个问题。 ;)
我使用了一个标记器/解析器,因为它非常简单,更重要的是,更容易阅读以便以后进行维护。
这有效,例如:
using System;
using System.Collections;
using System.Collections.Generic;
using System.IO;
using System.Text;
/// <summary>
/// Console demo: parses a sample comma-separated string with CsvParser and
/// prints each parsed field prefixed by its zero-based position.
/// </summary>
class Program
{
    /// <summary>
    /// Entry point. Prints the raw sample string, then one "index: field"
    /// line per parsed field, and finally waits for a key press.
    /// </summary>
    /// <param name="args">Command-line arguments (not used).</param>
    static void Main(string[] args)
    {
        string myString = "cat,dog,\"0 = OFF, 1 = ON\",lion,tiger,'R = red, G = green, B = blue',bear";
        Console.WriteLine("\nmyString is ...\n\t" + myString + "\n");

        CsvParser parser = new CsvParser(myString);
        Int32 lineNumber = 0;
        foreach (string s in parser)
        {
            Console.WriteLine(lineNumber + ": " + s);
            // Advance the counter; without this every line was printed as "0: ...".
            lineNumber++;
        }

        // Keep the console window open until a key is pressed.
        Console.ReadKey();
    }
}
/// <summary>Kinds of token used by the tokenizer/parser pair in this sample.</summary>
internal enum TokenType
{
/// <summary>A field separator; StreamTokenizer emits it for the ',' character.</summary>
Comma,
/// <summary>A quote character; StreamTokenizer emits it for '\'' and for '"'.</summary>
Quote,
/// <summary>A run of consecutive characters that are neither commas nor quote characters.</summary>
Value
}
/// <summary>
/// A single lexical token: its kind plus the text it was created from.
/// Both properties are assigned once in the constructor and can only be
/// changed from inside this class.
/// </summary>
internal class Token
{
    /// <summary>The kind of token.</summary>
    public TokenType Type { get; private set; }

    /// <summary>The text carried by the token.</summary>
    public String Value { get; private set; }

    /// <summary>Creates a token of the given kind carrying the given text.</summary>
    /// <param name="type">The kind of token.</param>
    /// <param name="value">The text carried by the token.</param>
    public Token(TokenType type, string value)
    {
        Type = type;
        Value = value;
    }
}
/// <summary>
/// Splits text read from a TextReader into Comma, Quote and Value tokens.
/// </summary>
/// <remarks>
/// The input is read line by line with ReadLine, and no token is produced for
/// a line break, so consumers cannot see where one line ends and the next
/// begins. Enumeration reads from the reader passed to the constructor, so
/// each enumeration continues from wherever that reader currently is.
/// </remarks>
internal class StreamTokenizer : IEnumerable<Token>
{
    private readonly TextReader _reader;

    /// <summary>Creates a tokenizer over the given reader.</summary>
    /// <param name="reader">Source of the text to tokenize.</param>
    /// <exception cref="ArgumentNullException"><paramref name="reader"/> is null.</exception>
    public StreamTokenizer(TextReader reader)
    {
        if (reader == null)
        {
            throw new ArgumentNullException("reader");
        }

        _reader = reader;
    }

    /// <summary>
    /// Lazily yields the tokens of the remaining input: a Quote token for each
    /// '\'' or '"', a Comma token for each ',', and a Value token for each
    /// maximal run of other characters within a line.
    /// </summary>
    public IEnumerator<Token> GetEnumerator()
    {
        String line;
        StringBuilder value = new StringBuilder();
        while ((line = _reader.ReadLine()) != null)
        {
            foreach (Char c in line)
            {
                switch (c)
                {
                    case '\'':
                    case '"':
                        // A quote ends any value text collected so far.
                        if (value.Length > 0)
                        {
                            yield return new Token(TokenType.Value, value.ToString());
                            value.Length = 0;
                        }
                        yield return new Token(TokenType.Quote, c.ToString());
                        break;
                    case ',':
                        // A comma ends any value text collected so far.
                        if (value.Length > 0)
                        {
                            yield return new Token(TokenType.Value, value.ToString());
                            value.Length = 0;
                        }
                        yield return new Token(TokenType.Comma, c.ToString());
                        break;
                    default:
                        value.Append(c);
                        break;
                }
            }

            // Flush the value that runs up to the end of the line (thanks, dpan).
            if (value.Length > 0)
            {
                yield return new Token(TokenType.Value, value.ToString());
                // Clear the buffer so this text is not emitted a second time
                // as a prefix of the next line's first value.
                value.Length = 0;
            }
        }
    }

    IEnumerator IEnumerable.GetEnumerator()
    {
        return GetEnumerator();
    }
}
/// <summary>
/// Turns the token stream of a StreamTokenizer into field strings, treating
/// commas inside a quoted section as part of the field.
/// </summary>
/// <remarks>
/// Both '\'' and '"' open a quoted section, and only the same character closes
/// it; the other quote character is kept as literal text while inside.
/// Quote characters that open or close a section are not part of the field.
/// NOTE(review): a quote character appearing in an otherwise unquoted field
/// (for example an apostrophe) still opens a quoted section.
/// </remarks>
internal class CsvParser : IEnumerable<String>
{
    private readonly StreamTokenizer _tokenizer;

    /// <summary>Creates a parser that reads its input from a stream.</summary>
    /// <param name="data">Stream containing the delimited text.</param>
    public CsvParser(Stream data)
    {
        _tokenizer = new StreamTokenizer(new StreamReader(data));
    }

    /// <summary>Creates a parser over an in-memory string.</summary>
    /// <param name="data">The delimited text.</param>
    public CsvParser(String data)
    {
        _tokenizer = new StreamTokenizer(new StringReader(data));
    }

    /// <summary>
    /// Lazily yields the parsed fields in order. Input that produces no tokens
    /// yields nothing; otherwise the field after the last separator is always
    /// yielded, even when it is empty.
    /// </summary>
    /// <exception cref="InvalidOperationException">The tokenizer produced an unknown token type.</exception>
    public IEnumerator<string> GetEnumerator()
    {
        // Quote text that opened the current quoted section, or null outside quotes.
        string openQuote = null;
        // True once any token has been seen, i.e. there is a final field to emit.
        bool sawToken = false;
        StringBuilder result = new StringBuilder();
        foreach (Token token in _tokenizer)
        {
            sawToken = true;
            switch (token.Type)
            {
                case TokenType.Comma:
                    if (openQuote != null)
                    {
                        // Inside quotes a comma is ordinary field content.
                        result.Append(token.Value);
                    }
                    else
                    {
                        yield return result.ToString();
                        result.Length = 0;
                    }
                    break;
                case TokenType.Quote:
                    if (openQuote == null)
                    {
                        openQuote = token.Value;
                    }
                    else if (openQuote == token.Value)
                    {
                        openQuote = null;
                    }
                    else
                    {
                        // The other quote character inside a quoted section is literal
                        // text (e.g. the apostrophe in "it's"); it must not close the section.
                        result.Append(token.Value);
                    }
                    break;
                case TokenType.Value:
                    result.Append(token.Value);
                    break;
                default:
                    throw new InvalidOperationException("Unknown token type: " + token.Type);
            }
        }

        // Emit the last field even when it is empty ("a,b," has three fields);
        // the previous Length > 0 test silently dropped it.
        if (sawToken)
        {
            yield return result.ToString();
        }
    }

    IEnumerator IEnumerable.GetEnumerator()
    {
        return GetEnumerator();
    }
}
答案 4 :(得分:7)
只需添加我今天早上工作的解决方案。
// Each match must start at the beginning of the input or right after a comma.
// It is either a double-quoted run in which "" stands for an embedded quote,
// or a (possibly empty) run of characters other than ','.
Regex regex = new Regex("(?<=^|,)(\"(?:[^\"]|\"\")*\"|[^,]*)");
MatchCollection columns = regex.Matches("<-- input line -->");
for (int i = 0; i < columns.Count; i++)
{
    // Value is the raw matched text; a quoted column still carries its quotes.
    string s = columns[i].Value;
}
如您所见,您需要每行调用regex.Matches() 。然后它将返回一个MatchCollection,其具有与列相同数量的项目。显然,每个匹配的Value属性是已解析的值。
这仍然是一项正在进行的工作,但它很乐意解析CSV字符串,如:
2,3.03,"Hello, my name is ""Joshua""",A,B,C,,,D
答案 5 :(得分:3)
CSV不是正则语言(regular)。除非你的正则表达式引擎强大到足以处理CSV解析的有状态特性(不太可能,微软的实现就做不到),否则任何纯正则表达式方案都只是一串迟早会暴露的错误:一旦遇到上一个正则表达式没有完全覆盖的边界情况,就会出问题。
CSV读取作为状态机并不复杂,因为语法很简单,但即使如此,你必须考虑:引号,引号内的逗号,引号内的新行,空字段。
因此你应该只使用其他人的CSV解析器。我建议将CSVReader用于.Net
答案 6 :(得分:2)
功能:
/// <summary>
/// Splits a delimited string into its values. Text enclosed in single or
/// double quotes may contain the delimiter; the enclosing quote characters are
/// not part of the value. Leading whitespace of an unquoted value is skipped.
/// </summary>
/// <param name="arguments">The delimited text to split.</param>
/// <param name="delim">The delimiter character; defaults to a comma.</param>
/// <returns>
/// The values in order. There is always one more value than there are
/// unquoted delimiters, so an empty input yields a single empty string.
/// </returns>
/// <exception cref="System.ArgumentNullException"><paramref name="arguments"/> is null.</exception>
private List<string> ParseDelimitedString(string arguments, char delim = ',')
{
    if (arguments == null)
    {
        throw new System.ArgumentNullException("arguments");
    }

    // Quote character that opened the current quoted section; '\0' outside quotes.
    char openQuote = '\0';
    // True once unquoted content of the current value has started
    // (used to trim leading whitespace only).
    bool inNonQuotes = false;
    List<string> strings = new List<string>();
    StringBuilder sb = new StringBuilder();

    foreach (char c in arguments)
    {
        if (openQuote != '\0')
        {
            if (c == openQuote)
            {
                openQuote = '\0';
            }
            else
            {
                // Everything inside quotes is content, including the delimiter
                // and the other kind of quote character.
                sb.Append(c);
            }
        }
        else if (c == '\'' || c == '"')
        {
            openQuote = c;
        }
        else if (c == delim)
        {
            strings.Add(sb.ToString());
            sb.Length = 0;
            inNonQuotes = false;
        }
        else if (inNonQuotes || !char.IsWhiteSpace(c))
        {
            // Previously only the first such character was kept and the rest of
            // the value was dropped; every character after the start is content.
            inNonQuotes = true;
            sb.Append(c);
        }
        // Otherwise: leading whitespace of an unquoted value, skipped on purpose.
    }

    strings.Add(sb.ToString());
    return strings;
}
用法
// Sample line; note the space before "text" in the last value.
string myString = "cat,dog,\"0 = OFF, 1 = ON\",lion,tiger,'R = red, G = green, B = blue',bear, text";

// Parse with the default ',' delimiter and print one value per line.
foreach (var item in ParseDelimitedString(myString))
{
    Console.WriteLine(item);
}
输出:
cat
dog
0 = OFF, 1 = ON
lion
tiger
R = red, G = green, B = blue
bear
text
答案 7 :(得分:1)
我在该版本中发现了一些错误,例如,未加引号但值中包含单引号的字符串。
我同意尽可能使用FileHelper库,但是这个库要求你知道你的数据是什么样的......我需要一个通用的解析器。
所以我已将代码更新为以下内容并认为我分享了......
/// <summary>
/// Splits a delimited string into fields using a small state machine.
/// A field that starts with '\'' or '"' is a quoted field: it may contain the
/// delimiter, and its own quote character is written by doubling it. In a
/// field that does not start with a quote, quote characters are ordinary text.
/// </summary>
/// <param name="value">The delimited text to split.</param>
/// <param name="delimiter">The character that separates fields.</param>
/// <returns>
/// The fields in order. An empty input yields an empty list; otherwise the
/// field after the last delimiter is always included, even when it is empty.
/// </returns>
/// <exception cref="System.ArgumentNullException"><paramref name="value"/> is null.</exception>
public static List<string> ParseDelimitedString(string value, char delimiter)
{
    if (value == null)
    {
        throw new System.ArgumentNullException("value");
    }

    bool inQuotes = false;      // currently inside a quoted field
    bool inNonQuotes = false;   // currently inside an unquoted field
    bool secondQuote = false;   // the previous character was the field's quote character
    char curQuote = '\0';       // quote character that opened the current quoted field
    List<string> results = new List<string>();
    StringBuilder sb = new StringBuilder();

    foreach (char c in value)
    {
        if (inNonQuotes)
        {
            // Unquoted field: quotes are just characters.
            if (c == delimiter)
            {
                results.Add(sb.ToString());
                sb.Length = 0;
                inNonQuotes = false;
            }
            else
            {
                sb.Append(c);
            }
        }
        else if (inQuotes)
        {
            if (c == curQuote)
            {
                if (secondQuote)
                {
                    // Doubled quote: one literal quote character.
                    secondQuote = false;
                    sb.Append(c);
                }
                else
                {
                    // Either the closing quote or the first half of a doubled quote.
                    secondQuote = true;
                }
            }
            else if (secondQuote && c == delimiter)
            {
                // Closing quote followed by the delimiter: the field is complete.
                results.Add(sb.ToString());
                sb.Length = 0;
                inQuotes = false;
            }
            else if (!secondQuote)
            {
                sb.Append(c);
            }
            // Otherwise: text after a closing quote, as in
            //   bad,as,"user entered something like"this,poorly escaped,value
            // which is ignored until the next delimiter.
        }
        else
        {
            // Not yet inside a field.
            if (c == '\'' || c == '"')
            {
                curQuote = c;
                inQuotes = true;
                secondQuote = false;
            }
            else if (c == delimiter)
            {
                // Blank field.
                results.Add(string.Empty);
            }
            else
            {
                inNonQuotes = true;
                sb.Append(c);
            }
        }
    }

    // A non-empty input always has one more field to emit. Either a field is
    // still open, or the input ended right after a delimiter, in which case the
    // trailing field is empty (sb is empty then). That trailing empty field
    // used to be dropped, so "a," produced one field while ",a" produced two.
    if (value.Length > 0)
    {
        results.Add(sb.ToString());
    }

    return results;
}
答案 8 :(得分:0)
因为另一个问题(Regex to to parse csv with nested quotes)被归并到了这里,而本问题更为通用;
同时也因为正则表达式并不是解决此问题的正确方法(例如,我遇到了许多灾难性回溯问题,参见 http://www.regular-expressions.info/catastrophic.html),
所以这里给出一个简单的Python解析器实现:
def csv_to_array(string):
    """Split delimited text into a flat list of field strings.

    Fields are separated by commas that are outside double quotes, and by
    newlines (a newline always ends the current field, quoted or not).
    Double-quote characters toggle the quoted state and are not included in
    the fields. Single quotes are ordinary characters.

    Returns an empty list for empty input. Otherwise the field after the last
    separator is included as well, even when it is empty, unless the input
    ends with a newline (which has already terminated the last field).
    """
    in_quotes = False
    field = []
    fields = []
    for c in string:
        if c == "\"":
            # An opening quote enters quoted mode, the next one leaves it.
            in_quotes = not in_quotes
        elif (c == "," and not in_quotes) or (c == "\n"):
            fields.append("".join(field))
            field = []
        else:
            field.append(c)
    # Flush the final field: previously it was lost unless the input happened
    # to end with a newline.
    if string and not string.endswith("\n"):
        fields.append("".join(field))
    return fields