我有一个 2.6GB 的“半 CSV”文件需要解析。所谓“半 CSV”,是指它的格式为 (数据,数据2,数据3,...),(更多数据,更多数据2,更多数据3,...),(...)。也就是说,各行之间由 "),(" 分隔,而不是由换行符分隔(这意味着整个文件基本上就是一行)。
我的计划是读取整个文件并按 "),(" 拆分,然后再根据需要解析每个元素。显然,这样做在 C# 中会遇到“内存不足”的问题;但我也不能简单地把文件切成若干小块,因为无法保证切分点不会把某条数据从中间截断。有什么办法可以做到这一点吗?
答案 0 :(得分:0)
完全未经测试。字符串必须写成 "mystring" 的形式。不支持字符串中的转义,也不支持在字符串中转义 " 字符,因此下面这些写法都是无效的:"my""quote" 或 "my\"quote"。
该文件必须是完美的:末尾没有 EOF 标记,末尾没有换行;除字符串内部以外,任何地方都不能有空格,也不能有换行。字符串内部可以出现除 "(它标记字符串的结尾)之外的任何字符。不能有元素过多的行,也不能有元素过少的行;不处理 null(严格来说,对于字符串类型的列,`,,` 会返回一个空字符串而不会抛出错误)。
支持 Convert.ChangeType 所支持的所有类型。
用法:
// Open the input as a text reader; the parser pulls one character at a time,
// so the whole file is never loaded into memory.
using (var sr = new StreamReader("myfile.txt"))
{
    // The Type[] lists the expected type of each column, in order.
    foreach (var objs in ParseStream(sr, new Type[] { typeof(int), typeof(double), typeof(string) }, CultureInfo.InvariantCulture))
    {
        // objs is an object[] where each member is of the type asked
        // when ParseStream was called
    }
}
代码:
/// <summary>
/// Lazily parses text of the form <c>(a,b,c),(d,e,f),...</c> read from
/// <paramref name="tr"/>, yielding one <c>object[]</c> per parenthesised row.
/// The input is consumed one character at a time, so memory use is bounded by
/// the size of a single row rather than by the size of the whole input.
/// </summary>
/// <remarks>
/// Fields may be bare text or wrapped in double quotes. Inside quotes every
/// character except <c>"</c> is taken literally; there is no escape syntax.
/// Relies on the <c>State</c> enum (WaitingForOpenBracket, WaitingForData,
/// WaitingForColumnSeparator, WaitingForEndQuotes, WaitingForRowSeparator),
/// which is declared outside this block.
/// </remarks>
/// <param name="tr">Reader positioned at the first <c>(</c> of the input.</param>
/// <param name="types">Target type of each column, in column order.</param>
/// <param name="culture">
/// Format provider passed to <see cref="Convert.ChangeType(object, Type, IFormatProvider)"/>;
/// <see cref="CultureInfo.InvariantCulture"/> is used when null.
/// </param>
/// <returns>A deferred sequence with one converted <c>object[]</c> per row.</returns>
/// <exception cref="ArgumentNullException">
/// <paramref name="tr"/> or <paramref name="types"/> is null.
/// </exception>
/// <exception cref="FormatException">
/// Thrown during enumeration when the input is malformed (unexpected character,
/// wrong number of fields in a row, or input ending anywhere other than right
/// after a closing bracket).
/// </exception>
public static IEnumerable<object[]> ParseStream(TextReader tr, Type[] types, IFormatProvider culture = null)
{
    // Validate here rather than in the iterator: an iterator body does not run
    // until the first MoveNext, which would delay these errors to enumeration.
    if (tr == null)
    {
        throw new ArgumentNullException("tr");
    }
    if (types == null)
    {
        throw new ArgumentNullException("types");
    }
    return ParseStreamIterator(tr, types, culture ?? CultureInfo.InvariantCulture);
}

// Character-by-character state machine behind ParseStream; arguments are already validated.
private static IEnumerable<object[]> ParseStreamIterator(TextReader tr, Type[] types, IFormatProvider culture)
{
    var parts = new List<string>(types.Length);
    var sb = new StringBuilder();
    State state = State.WaitingForOpenBracket;

    // (row, col) is the zero-based position of the character being processed.
    long row = 0;
    long col = -1;
    bool previousWasNewLine = false;

    int read;
    while ((read = tr.Read()) != -1)
    {
        char ch = (char)read;

        // Advance the position exactly once per character. A '\n' belongs to
        // the row it terminates; the character after it starts the next row.
        if (previousWasNewLine)
        {
            row++;
            col = 0;
        }
        else
        {
            col++;
        }
        previousWasNewLine = ch == '\n';

        switch (state)
        {
            case State.WaitingForOpenBracket:
                if (ch != '(')
                {
                    throw new FormatException(string.Format("Malformed begin-of-the-row at R: {0}, C: {1}, char: {2}", row, col, ch));
                }
                state = State.WaitingForData;
                break;

            case State.WaitingForData:
            case State.WaitingForColumnSeparator:
                if (ch == ',' || ch == ')')
                {
                    // End of a field: whatever was accumulated is its raw text.
                    parts.Add(sb.ToString());
                    sb.Clear();
                    if (parts.Count > types.Length)
                    {
                        throw new FormatException(string.Format("Too many parts starting at R: {0}, C: {1}", row, col));
                    }
                    if (ch == ')')
                    {
                        if (parts.Count < types.Length)
                        {
                            throw new FormatException(string.Format("Too few parts ending at R: {0}, C: {1}", row, col));
                        }
                        var values = new object[parts.Count];
                        for (int i = 0; i < values.Length; i++)
                        {
                            values[i] = Convert.ChangeType(parts[i], types[i], culture);
                        }
                        parts.Clear();
                        state = State.WaitingForRowSeparator;
                        yield return values;
                    }
                    else
                    {
                        // A new field begins after the comma. Without this reset
                        // the state would stay WaitingForColumnSeparator after a
                        // quoted field and reject the next field's first character.
                        state = State.WaitingForData;
                    }
                }
                else
                {
                    // After a closing quote only ',' or ')' is acceptable.
                    if (state == State.WaitingForColumnSeparator)
                    {
                        throw new FormatException(string.Format("Malformed column separator at R: {0}, C: {1}, char: {2}", row, col, ch));
                    }
                    if (ch == '"')
                    {
                        // An opening quote is only valid as the first character of a field.
                        if (sb.Length != 0)
                        {
                            throw new FormatException(string.Format("Malformed string at R: {0}, C: {1}, char: {2}", row, col, ch));
                        }
                        state = State.WaitingForEndQuotes;
                    }
                    else
                    {
                        sb.Append(ch);
                    }
                }
                break;

            case State.WaitingForEndQuotes:
                if (ch == '"')
                {
                    state = State.WaitingForColumnSeparator;
                }
                else
                {
                    // Inside quotes everything, including ',' ')' and newlines, is literal.
                    sb.Append(ch);
                }
                break;

            case State.WaitingForRowSeparator:
                if (ch != ',')
                {
                    throw new FormatException(string.Format("Malformed row separator at R: {0}, C: {1}, char: {2}", row, col, ch));
                }
                state = State.WaitingForOpenBracket;
                break;
        }
    }

    // The only valid place for the input to end is right after a ')'.
    if (state != State.WaitingForRowSeparator)
    {
        throw new FormatException(string.Format("Malformed end-of-file at R: {0}, C: {1}", row, col));
    }
}