我正在尝试使用 C# 收集一些数据。我有一个系统,它以一种独特的非标准格式输出数据。我需要定期从平面文件中解析这些数据,然后将其导入数据库,并且解析速度要尽可能快。数据库部分我已经处理好了,这部分很简单。我需要帮助找出解析该文件的最佳方法。目前文件大约有 15000 行,而且每天都在增加。下面是一条示例数据:第一部分是数据在平面文件中的实际形式,第二部分是同一条数据更易于理解的展开视图。
{a test entry} {{1}{{city}{chicago}{employee}{johnsmith}{building}{5}{room}{506A}{room}{506B}{id}{1234}}{2}{{city}{losangeles}{employee}{johnsmith}{building}{1}{room}{101A}{room}{102B}{id}{1234}}}
{a test entry}
{
{1}
{
{city} {chicago}
{employee} {johnsmith}
{building} {5}
{room} {506A}
{room} {506B}
{id} {1234}
}
{2}
{
{city} {losangeles}
{employee} {johnsmith}
{building} {1}
{room} {101A}
{room} {102B}
{id} {1234}
}
}
每个条目可以只有一个子条目(即 {2} 下没有数据),也可以有多达数百个子条目。
我应该如何解决这个问题呢?我尝试过用 Split 和 Substring 之类的方法来处理,但效果时好时坏,而且速度很慢。
有什么方法可以简单地解析我正在查看的数据吗?
答案 0 :(得分:3)
创建一个堆栈,并逐个字符地处理输入字符串:
// Walk the input one character at a time, keeping one StringBuilder per
// brace group that is currently open (innermost group on top of the stack).
var stack = new Stack<StringBuilder>();
foreach (var ch in input)
{
    switch (ch)
    {
        case '{':
            // A group opens: start collecting its text in a fresh buffer.
            stack.Push(new StringBuilder());
            break;

        case '}':
        {
            // The innermost group closes: print its text, indented by two
            // spaces for every group that is still open around it.
            var text = stack.Pop().ToString();
            var padding = new string(' ', stack.Count * 2);
            Console.WriteLine(padding + text);
            break;
        }

        default:
            // Characters outside of every group are ignored.
            if (stack.Count != 0)
            {
                stack.Peek().Append(ch);
            }
            break;
    }
}
输出:
a test entry
1
city
chicago
employee
johnsmith
building
5
room
506A
room
506B
id
1234
2
city
losangeles
employee
johnsmith
building
1
room
101A
room
102B
id
1234
现在您已经解析了数据,您只需要确定要将数据放入哪个数据结构。
答案 1 :(得分:0)
可以像下面这样做:
/// <summary>
/// Demo entry point: tokenizes a hard-coded sample record, parses the tokens
/// into a tree under a Root node, groups key/value pairs and prints the tree.
/// </summary>
/// <param name="args">Command-line arguments (not used).</param>
static void Main(string[] args)
{
    string text = "{a test entry} {{1}{{city}{chicago}{employee}{johnsmith}{building}{5}{room}{506A}{room}{506B}{id}{1234}}{2}{{city}{losangeles}{employee}{johnsmith}{building}{1}{room}{101A}{room}{102B}{id}{1234}}}";
    List<Token> tokens = Tokenize(text);

    // The root node carries no text of its own; it only anchors the tree.
    Node root = new Node(new Token(TokenType.Root, string.Empty));
    int index = 0;
    Node node = Parse(root, tokens, ref index);

    RaiseSubtrees(node);
    Console.WriteLine(node.ToString());
}
/// <summary>
/// Splits brace-delimited text into a flat list of tokens: an ObjectStart for
/// every '{', an ObjectEnd for every '}', and a Text token (emitted just before
/// the ObjectEnd) for any non-empty text collected directly inside that group.
/// </summary>
/// <param name="text">The raw input to scan.</param>
/// <returns>The tokens in the order they were produced.</returns>
static List<Token> Tokenize(string text)
{
    var tokens = new List<Token>();
    // One buffer per currently open group; the innermost group is on top.
    var openGroups = new Stack<StringBuilder>();

    foreach (char c in text)
    {
        switch (c)
        {
            case '{':
                openGroups.Push(new StringBuilder());
                tokens.Add(new Token(TokenType.ObjectStart, "{"));
                break;

            case '}':
            {
                // Pop throws if there is no matching '{'.
                string content = openGroups.Pop().ToString();
                if (content.Length > 0)
                {
                    tokens.Add(new Token(TokenType.Text, content));
                }
                tokens.Add(new Token(TokenType.ObjectEnd, "}"));
                break;
            }

            default:
                // Characters that are not inside any group are dropped.
                if (openGroups.Count > 0)
                {
                    openGroups.Peek().Append(c);
                }
                break;
        }
    }

    return tokens;
}
/// <summary>
/// Builds a tree from a token list. Starting at <paramref name="index"/>,
/// tokens are attached as children of <paramref name="parent"/> until an
/// ObjectEnd token or the end of the list is reached. An ObjectStart token
/// becomes a child node whose own children are parsed recursively.
/// </summary>
/// <param name="parent">The node that receives the parsed children.</param>
/// <param name="tokens">The full token list.</param>
/// <param name="index">
/// Position of the next token to read. On return it is the position of the
/// ObjectEnd that stopped this call, or a value past the end of the list.
/// </param>
/// <returns>The same <paramref name="parent"/> instance.</returns>
static Node Parse(Node parent, List<Token> tokens, ref int index)
{
    // Every token is examined, including the last one. The earlier bound of
    // Count - 1 only existed to protect an unused look-ahead read and caused
    // the final token to be skipped.
    for (; index < tokens.Count; index++)
    {
        Token current = tokens[index];
        switch (current.TokenType)
        {
            case TokenType.ObjectStart:
            {
                Node child = new Node(current);
                parent.Children.Add(child);
                // Step past the '{' so the recursive call starts on the first
                // token inside the group. It returns with index on the
                // matching ObjectEnd, which the loop increment then skips.
                index++;
                Parse(child, tokens, ref index);
                break;
            }

            case TokenType.Entry:
            case TokenType.Text:
                parent.Children.Add(new Node(current));
                break;

            case TokenType.ObjectEnd:
                // This group is closed; hand control back to the caller.
                return parent;
        }
    }

    return parent;
}
/// <summary>
/// Simplifies a parsed tree in place. A node with exactly one child is
/// replaced by that child (its token and its children). After the children
/// have been simplified recursively, a node whose children are all Text
/// tokens has them grouped, two at a time, into Property nodes holding a
/// Key node and a Value node.
/// </summary>
/// <param name="node">The root of the subtree to simplify.</param>
static void RaiseSubtrees(Node node)
{
    // Collapse single-child chains. The only child's children are promoted
    // rather than discarded, so a wrapper around a non-leaf child no longer
    // loses that child's contents.
    while (node.Children.Count == 1)
    {
        Node only = node.Children[0];
        node.Token = only.Token;
        node.Children.Clear();
        foreach (Node grandChild in only.Children)
        {
            node.Children.Add(grandChild);
        }
    }

    foreach (Node child in node.Children)
    {
        RaiseSubtrees(child);
    }

    // Only a run consisting purely of Text children is treated as key/value data.
    if (!node.Children.All(c => c.Token.TokenType == TokenType.Text))
    {
        return;
    }

    // Pair from the end so replacing two entries with one does not disturb
    // the indices still to be visited.
    // NOTE(review): with an odd number of children the first child is left
    // unpaired as a Text node - confirm this is the intended handling.
    for (int i = node.Children.Count - 1; i >= 1; i -= 2)
    {
        Node keyNode = node.Children[i - 1];
        Node valueNode = node.Children[i];
        keyNode.Token.TokenType = TokenType.Key;
        valueNode.Token.TokenType = TokenType.Value;

        Node property = new Node(new Token(TokenType.Property, string.Empty));
        property.Children.Add(keyNode);
        property.Children.Add(valueNode);

        // Replace the (key, value) pair with the single Property node.
        node.Children.RemoveAt(i);
        node.Children.RemoveAt(i - 1);
        node.Children.Insert(i - 1, property);
    }
}
/// <summary>Kinds of token used by the tokenizer and the tree nodes.</summary>
enum TokenType
{
    /// <summary>Accepted by the parser as a leaf, alongside <see cref="Text"/>.</summary>
    Entry = 0,

    /// <summary>First element of a key/value pair.</summary>
    Key = 1,

    /// <summary>An opening brace.</summary>
    ObjectStart = 2,

    /// <summary>A closing brace.</summary>
    ObjectEnd = 3,

    /// <summary>Container node holding one key and one value.</summary>
    Property = 4,

    /// <summary>The top of the tree.</summary>
    Root = 5,

    /// <summary>Raw text found inside a pair of braces.</summary>
    Text = 6,

    /// <summary>Second element of a key/value pair.</summary>
    Value = 7
}
/// <summary>
/// A token: a <see cref="TokenType"/> together with its text. Both members
/// are mutable so a token can be reclassified after it has been created.
/// </summary>
class Token
{
    /// <summary>Creates a token whose members are left at their defaults.</summary>
    public Token()
    {
    }

    /// <summary>Creates a token with the given kind and text.</summary>
    /// <param name="tokenType">The kind of token.</param>
    /// <param name="value">The text carried by the token.</param>
    public Token(TokenType tokenType, string value)
    {
        TokenType = tokenType;
        Value = value;
    }

    /// <summary>The kind of this token.</summary>
    public TokenType TokenType { get; set; }

    /// <summary>The text carried by this token.</summary>
    public string Value { get; set; }
}
/// <summary>
/// A tree node: one <see cref="Token"/> plus an ordered list of child nodes.
/// </summary>
class Node
{
    /// <summary>Creates a node for <paramref name="token"/> with an empty child list.</summary>
    /// <param name="token">The token this node represents.</param>
    public Node(Token token)
    {
        Token = token;
        Children = new List<Node>();
    }

    /// <summary>The token this node represents.</summary>
    public Token Token { get; set; }

    /// <summary>The child nodes, in order.</summary>
    public IList<Node> Children { get; set; }

    /// <summary>Renders this node and all of its descendants, one node per line.</summary>
    /// <returns>The rendered text; every line ends with "\n".</returns>
    public override string ToString()
    {
        var output = new StringBuilder();
        ToString(this, output, string.Empty);
        return output.ToString();
    }

    /// <summary>
    /// Appends one line for <paramref name="parent"/> and then, recursively,
    /// one line for each descendant with one more space of indentation per level.
    /// </summary>
    /// <param name="parent">The node to render.</param>
    /// <param name="builder">The buffer that receives the text.</param>
    /// <param name="indent">The prefix written before this node's line.</param>
    public void ToString(Node parent, StringBuilder builder, string indent)
    {
        TokenType kind = parent.Token.TokenType;

        builder.Append(indent);
        builder.Append(kind.ToString());

        // Root and ObjectStart lines show only the kind; all others also show the value.
        bool showsKindOnly = kind == TokenType.Root || kind == TokenType.ObjectStart;
        if (!showsKindOnly)
        {
            builder.Append(": ");
            builder.Append(parent.Token.Value);
        }
        builder.Append("\n");

        string childIndent = indent + " ";
        for (int i = 0; i < parent.Children.Count; i++)
        {
            ToString(parent.Children[i], builder, childIndent);
        }
    }
}
这里的标记化(tokenize)采用了与 dtb 的答案类似的方法,不过我随后使用 Node 类构建了一棵对数据建模的树。这样您就可以以更结构化的方式处理数据。上面 Main 方法的输出如下所示:
Root
Text: a test entry
ObjectStart
Text: 1
ObjectStart
Property:
Key: city
Value: chicago
Property:
Key: employee
Value: johnsmith
Property:
Key: building
Value: 5
Property:
Key: room
Value: 506A
Property:
Key: room
Value: 506B
Property:
Key: id
Value: 1234
Text: 2
ObjectStart
Property:
Key: city
Value: losangeles
Property:
Key: employee
Value: johnsmith
Property:
Key: building
Value: 1
Property:
Key: room
Value: 101A
Property:
Key: room
Value: 102B
Property:
Key: id
Value: 1234