在C#中使用正则表达式解析文本文件的多个部分

时间:2015-05-26 11:47:28

标签: c# regex

我想解析一个内容类似于下面的文本文件:

START-OF-DATA
#100846105
START SECURITY|US912810DZ85|CBBT|
## in: 20150430_14:59:00 to 20150430_15:00:00 [13 (New York-DST)]
## out:20150430_14:59:00 to 20150430_15:00:00 [13 (New York-DST)]
04/30|15:00:00|B|118.640625||| |A|118.703125||| ||
04/30|14:59:54|B|118.6328125||| |A|118.6953125||| ||
04/30|14:59:52|B|118.6328125||| |A|118.6953125||| ||
04/30|14:59:23|B|118.6328125||| |A|118.6953125||| ||
04/30|14:59:20|B|118.6328125||| |A|118.6953125||| ||
END SECURITY|US912810DZ85|0|
#100846111
START SECURITY|US912810EA26|CBBT|
## in: 20150430_14:59:00 to 20150430_15:00:00 [13 (New York-DST)]
## out:20150430_14:59:00 to 20150430_15:00:00 [13 (New York-DST)]
04/30|15:00:00|B|124.75||| |A|124.828125||| ||
04/30|14:59:55|B|124.75||| |A|124.8203125||| ||
04/30|14:59:53|B|124.7421875||| |A|124.8203125||| ||
04/30|14:59:45|B|124.7421875||| |A|124.8125||| ||
04/30|14:59:43|B|124.7421875||| |A|124.828125||| ||
04/30|14:59:27|B|124.7421875||| |A|124.8125||| ||
04/30|14:59:24|B|124.7421875||| |A|124.828125||| ||
04/30|14:59:22|B|124.7421875||| |A|124.8125||| ||
04/30|14:59:20|B|124.7421875||| |A|124.828125||| ||
04/30|14:59:13|B|124.7421875||| |A|124.8125||| ||
END SECURITY|US912810EA26|0|
END-OF-DATA

使用以下代码

// Matches one START-OF-DATA ... END-OF-DATA block. Everything between the two markers
// (any run of characters other than a backslash, matched lazily) is captured in the
// named group "InstrumentsSection".
string pattern = @"^(START-OF-DATA\r\n)(?<InstrumentsSection>[^\\]*?)(?:(^END-OF-DATA))";

// NOTE(review): `regex` is not declared in this snippet - presumably it is built from
// `pattern` (with RegexOptions.Multiline so that '^' can match END-OF-DATA) - confirm.
string fileText = File.ReadAllText(filePath);
var dataBlocks = regex.Matches(fileText);
foreach (Match dataBlock in dataBlocks)
{
    // Break the captured section into its non-empty lines.
    string sectionText = dataBlock.Groups["InstrumentsSection"].Value;
    string[] sectionLines = sectionText.Split(new string[] { Environment.NewLine }, StringSplitOptions.RemoveEmptyEntries);

    // Each match replaces the previous result, so only the last block's lines remain.
    instruments = sectionLines.ToList();
}

我能够检索到START-OF-DATA和END-OF-DATA之间的每一行。但是,我想忽略以START SECURITY、## 和END SECURITY开头的行。此外,还希望将刻度值和标识符(例如100846105、100846111)分别归入不同的组中。

有人可以建议吗?

2 个答案:

答案 0 :(得分:1)

您可以逐行读取文件,并过滤掉您不想要的行。此外,可以将刻度值和ID收集到字符串列表中。

示例代码:

// Reads the file line by line. For every START-OF-DATA ... END-OF-DATA block it
//   * appends to `res` each line that is not a START SECURITY, END SECURITY or "##" line
//     (the START-OF-DATA / END-OF-DATA markers and the "#id" lines are kept), and
//   * collects the identifier lines (a single leading '#', stored with the '#') in `ids`.
// Marker checks use ordinal comparison because the markers are fixed protocol text,
// not culture-dependent words.
var res = string.Empty;
var ids = new List<string>();
using (var sr = new StreamReader(filepath, true))
{
    string s;
    while ((s = sr.ReadLine()) != null)
    {
        // Lines outside a data block are ignored.
        if (!s.StartsWith("START-OF-DATA", System.StringComparison.Ordinal))
        {
            continue;
        }

        // The null check stops the loop at end of file: without it a block that is
        // missing its END-OF-DATA marker would dereference a null line.
        while (s != null && !s.StartsWith("END-OF-DATA", System.StringComparison.Ordinal))
        {
            bool isComment = s.StartsWith("##", System.StringComparison.Ordinal);

            if (!isComment &&
                !s.StartsWith("START SECURITY", System.StringComparison.Ordinal) &&
                !s.StartsWith("END SECURITY", System.StringComparison.Ordinal))
            {
                res += s + System.Environment.NewLine;
            }

            if (!isComment && s.StartsWith("#", System.StringComparison.Ordinal))
            {
                ids.Add(s);
            }

            s = sr.ReadLine();
        }

        // Append the closing END-OF-DATA marker (absent when the file was truncated).
        if (s != null)
        {
            res += s;
        }
    }
}

输出:

enter image description here

START-OF-DATA
#100846105
04/30|15:00:00|B|118.640625||| |A|118.703125||| ||
04/30|14:59:54|B|118.6328125||| |A|118.6953125||| ||
04/30|14:59:52|B|118.6328125||| |A|118.6953125||| ||
04/30|14:59:23|B|118.6328125||| |A|118.6953125||| ||
04/30|14:59:20|B|118.6328125||| |A|118.6953125||| ||
#100846111
04/30|15:00:00|B|124.75||| |A|124.828125||| ||
04/30|14:59:55|B|124.75||| |A|124.8203125||| ||
04/30|14:59:53|B|124.7421875||| |A|124.8203125||| ||
04/30|14:59:45|B|124.7421875||| |A|124.8125||| ||
04/30|14:59:43|B|124.7421875||| |A|124.828125||| ||
04/30|14:59:27|B|124.7421875||| |A|124.8125||| ||
04/30|14:59:24|B|124.7421875||| |A|124.828125||| ||
04/30|14:59:22|B|124.7421875||| |A|124.8125||| ||
04/30|14:59:20|B|124.7421875||| |A|124.828125||| ||
04/30|14:59:13|B|124.7421875||| |A|124.8125||| ||
END-OF-DATA

然后,如果您要读取多个块,只需创建一个字符串列表,在 res += s; 之后把 res 添加到该列表中,并将 res 重置为空字符串即可。

答案 1 :(得分:1)

这是一个简单的解析器

using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.IO;


namespace ConsoleApplication1
{
    /// <summary>
    /// Small line-oriented parser for START-OF-DATA ... END-OF-DATA dumps: every
    /// "#identifier" line opens a <see cref="Section"/> and the tick rows that follow
    /// are stored in it.
    /// </summary>
    class Program
    {
        static void Main(string[] args)
        {
            string input =
               "START-OF-DATA\n" +
               "#100846105\n" +
               "START SECURITY|US912810DZ85|CBBT|\n" +
               "## in: 20150430_14:59:00 to 20150430_15:00:00 [13 (New York-DST)]\n" +
               "## out:20150430_14:59:00 to 20150430_15:00:00 [13 (New York-DST)]\n" +
               "04/30|15:00:00|B|118.640625||| |A|118.703125||| ||\n" +
               "04/30|14:59:54|B|118.6328125||| |A|118.6953125||| ||\n" +
               "04/30|14:59:52|B|118.6328125||| |A|118.6953125||| ||\n" +
               "04/30|14:59:23|B|118.6328125||| |A|118.6953125||| ||\n" +
               "04/30|14:59:20|B|118.6328125||| |A|118.6953125||| ||\n" +
               "END SECURITY|US912810DZ85|0|\n" +
               "#100846111\n" +
               "START SECURITY|US912810EA26|CBBT|\n" +
               "## in: 20150430_14:59:00 to 20150430_15:00:00 [13 (New York-DST)]\n" +
               "## out:20150430_14:59:00 to 20150430_15:00:00 [13 (New York-DST)]\n" +
               "04/30|15:00:00|B|124.75||| |A|124.828125||| ||\n" +
               "04/30|14:59:55|B|124.75||| |A|124.8203125||| ||\n" +
               "04/30|14:59:53|B|124.7421875||| |A|124.8203125||| ||\n" +
               "04/30|14:59:45|B|124.7421875||| |A|124.8125||| ||\n" +
               "04/30|14:59:43|B|124.7421875||| |A|124.828125||| ||\n" +
               "04/30|14:59:27|B|124.7421875||| |A|124.8125||| ||\n" +
               "04/30|14:59:24|B|124.7421875||| |A|124.828125||| ||\n" +
               "04/30|14:59:22|B|124.7421875||| |A|124.8125||| ||\n" +
               "04/30|14:59:20|B|124.7421875||| |A|124.828125||| ||\n" +
               "04/30|14:59:13|B|124.7421875||| |A|124.8125||| ||\n" +
               "END SECURITY|US912810EA26|0|\n" +
               "END-OF-DATA\n";

            List<Section> sections = ParseSections(input);
        }

        /// <summary>
        /// Splits <paramref name="input"/> into sections, one per "#identifier" line.
        /// </summary>
        /// <param name="input">Full text of the dump; may be null or empty.</param>
        /// <returns>
        /// The sections in the order their identifier lines appear. Each section holds
        /// the identifier without its leading '#' and the trimmed rows that follow it.
        /// Blank lines, "##" comment lines and lines starting with START or END
        /// (START-OF-DATA, START SECURITY, END SECURITY, END-OF-DATA) are skipped, as
        /// are rows that appear before the first identifier line.
        /// </returns>
        public static List<Section> ParseSections(string input)
        {
            List<Section> sections = new List<Section>();
            if (string.IsNullOrEmpty(input))
            {
                return sections;
            }

            Section currentSection = null;
            using (StringReader reader = new StringReader(input))
            {
                string inputLine;
                while ((inputLine = reader.ReadLine()) != null)
                {
                    inputLine = inputLine.Trim();

                    // Blank lines carry no data; "##" lines are comments (in:/out: ranges etc.).
                    if (inputLine.Length == 0 ||
                        inputLine.StartsWith("##", StringComparison.Ordinal))
                    {
                        continue;
                    }

                    // A single leading '#' introduces a new section identifier.
                    if (inputLine.StartsWith("#", StringComparison.Ordinal))
                    {
                        currentSection = new Section();
                        currentSection.iD = inputLine.Substring(1);
                        currentSection.data = new List<string>();
                        sections.Add(currentSection);
                        continue;
                    }

                    // StartsWith (unlike Substring(0, n)) is safe for lines shorter than the marker.
                    if (inputLine.StartsWith("END", StringComparison.Ordinal) ||
                        inputLine.StartsWith("START", StringComparison.Ordinal))
                    {
                        continue;
                    }

                    // A row with no preceding identifier has no section to belong to; skip it
                    // instead of dereferencing a null section.
                    if (currentSection != null)
                    {
                        currentSection.data.Add(inputLine);
                    }
                }
            }

            return sections;
        }

        /// <summary>One identifier together with the rows listed under it.</summary>
        public class Section
        {
            /// <summary>Identifier taken from the "#..." line, without the leading '#'.</summary>
            public string iD { get; set; }

            /// <summary>Rows that followed the identifier, in file order.</summary>
            public List<string> data { get; set; }
        }
    }

}