使用DotNet中的RegExp提取嵌入的文本文件

时间:2015-05-28 00:56:57

标签: .net regex

我正在尝试使用RegExp表达式来提取存储在数据库中单个大字段中的文本文件。该字段被检索到一个字符串变量中,并由RegExp DotNet例程处理。

文件的分隔方式如下:

%%%%% FILENAME1.TXT %%%%%

(file content)

%%%%% --END-- %%%%%

%%%%% FILENAME2.TXT %%%%%

(file content)

%%%%% --END-- %%%%%

%%%%% FILENAME2.TXT %%%%%

(file content)

%%%%% --END-- %%%%%

%%%%% FILENAME3.RESX %%%%%

(file content)

%%%%% --END-- %%%%%

etc. etc

RegExp如下所示:

RegExpMatchStr := "^%%%%*[\t ]*([A-Z,a-z,_,-,.,\\,:]*)[\t ]*%%%%*\r+(?s)(.*?)^%%%%*[\t ]*(--END--)[\t ]*%%%%*\r+\s*";

然而,当我在代码中测试这个字符串时(以及在各种用于测试dotNet正则表达式的在线网站上测试时),我似乎只能得到第一个文件的提取结果。

我错过了什么?

编辑:我正在使用MultiLine和IgnoreCase选项

regex_options := ('RegexOptions'.IgnoreCase as number) + ('RegexOptions'.Multiline as number);

regex_code := NEW NET_OBJECT 'Regex' with RegExpMatchStr as String, regex_options as 'RegexOptions';

single_match := CALL regex_code.Matches with rawcode_text_from_database;

我已经尝试了在线Regex DotNet网站上的代码,它似乎只返回一个匹配集。

EDIT2:好的……上面简化后的表达式现在可以用于上面建议的示例文本,但不适用于更接近"真实"代码的实际版本:

maintenance:
    title:  TEST2;;
    mlmname:  ZMLM_LIB_CSMLM_SAMPLE;;
    arden: version 2.5;;
    version:  0.00;;
    institution:  ;;
    author:  ;;
    specialist:  ;;
    date:  2015-04-10;;
    validation:  testing;;

library:
    purpose:  
    ;;
    explanation:  
    ;;
    keywords:
    ;;
    citations: 
    ;;
knowledge:
    type: data-driven;;
    data:

    standard_libs := MLM 'std_include_libs';
    include standard_libs;

    using "mscorlib, Version=4.0.0.0, Culture=neutral, PublicKeyToken=b77a5c561934e089";
    using "System, Version=4.0.0.0, Culture=neutral, PublicKeyToken=b77a5c561934e089";
    using "System.Configuration, Version=4.0.0.0, Culture=neutral, PublicKeyToken=b03f5f7f11d50a3a";
    using "System.Configuration.Install, Version=4.0.0.0, Culture=neutral, PublicKeyToken=b03f5f7f11d50a3a";
    using "System.Core, Version=4.0.0.0, Culture=neutral, PublicKeyToken=b77a5c561934e089";
    using "System.Data, Version=4.0.0.0, Culture=neutral, PublicKeyToken=b77a5c561934e089";

    using namespace "System";
    using namespace "System.IO";
    using namespace "System.Text";
    using namespace "System.Data";
    using namespace "System.Text.RegularExpressions";
    using namespace "System.Collections.Generic";

    source_code_MLM := "ZMLM_LIB_CSMLM_SAMPLE";

    (rawcode_text_from_database) := read first { "  SELECT [Logic] FROM [DEVPNL].[dbo].[CV3MLM] WHERE [Name] = " || SQL(source_code_MLM) };

    //RegExpMatchStr := "^%%%%*[\t ]*([A-Z,a-z,_,-,.]*)[\t ]*%%%%*$(?s)(.*?)^%%%%*[\t ]*(END)%%%%*$";
    //RegExpMatchStr := "^%%%%*[\t ]*([A-Z,a-z,_,-,.,\\,:]*)[\t ]*%%%%*\r+(?s)(.*?)^%%%%*[\t ]*(END)[\t ]*%%%%*\r+\s*";
    RexExpMatchStr := "^%%%%*[\t ]*([-A-Za-z_.\\:\d]*)[\t ]*%%%%*(\r?\n)+(?s)(.*?)^%%%%*[\t ]*(END)[\t ]*%%%%*(\r?\n)+\s*";

    regex_code := NEW NET_OBJECT 'Regex' with RegExpMatchStr as String, 'RegexOptions'.Multiline;
    single_match := CALL regex_code.Matches with rawcode_text_from_database;

break;

    //only getting on match

break;

        single_match := CALL single_match.NextMatch;

    endif;


        //code_lines_array := CALL 'Regex'.Split with code_lines_ONLY_from_database[0] as 'String', "\r\n" as 'String';
        //res_lines_array   := CALL 'Regex'.Split with res_lines_ONLY_from_database[0] as 'String', "\r\n" as 'String';


    ;;
    priority: 50
    ;;
    evoke:  
    ;;
    logic:
        conclude true;
    ;;
    action:
    ;;
Urgency: 50;;
end:

maintenance:
    title:  ;;
    mlmname:  ZMLM_LIB_CSMLM_SAMPLE;;
    arden: version 2.5;;
    version:  0.00;;
    institution:  ;;
    author:  ;;
    specialist:  ;;
    date:  2015-04-10;;
    validation:  testing;;

library:
    purpose:  
    ;;
    explanation:  
    ;;
    keywords:
    ;;
    citations: 
    ;;
knowledge:
    type: data-driven;;
    data:
    ;;
    priority: 50
    ;;
    evoke:  
    ;;
    logic:
        conclude true;
    ;;
    action:
    ;;
Urgency: 50;;
end:

%%%%% C:\TEST\SAMPLE.CS %%%%%

using System;

namespace First
{
    public class Program
    {
        public static void Main()
        {
        Console.WriteLine("Hello, world!");
        }
    }
}

%%%%% --END-- %%%%%




%%%%% C:\TEST\SAMPLE.RESX %%%%%

<?xml version="1.0" encoding="utf-8"?>
<root>
  <!-- 
    Microsoft ResX Schema 

    Version 2.0

    The primary goals of this format is to allow a simple XML format 
    that is mostly human readable. The generation and parsing of the 
    various data types are done through the TypeConverter classes 
    associated with the data types.

%%%%% --END-- %%%%%

1 个答案:

答案 0 :(得分:1)

您必须指定RegexOptions.Multiline选项,才能让^匹配每一行的行首

// Sample payload: several text files concatenated into one string. Each file
// starts with a "%%%%% <name> %%%%%" header line and ends with a
// "%%%%% --END-- %%%%%" trailer line.
string input = 
@"%%%%% FILENAME1.TXT %%%%%

(file content)

%%%%% --END-- %%%%%

%%%%% FILENAME2.TXT %%%%%

(file content)

%%%%% --END-- %%%%%

%%%%% FILENAME2.TXT %%%%%

(file content)

%%%%% --END-- %%%%%

%%%%% FILENAME3.RESX %%%%%

(file content)

%%%%% --END-- %%%%%";

// Capture groups: 1 = file name, 3 = file content, 4 = the --END-- marker
// (2 and 5 are the line breaks that follow the header and trailer lines).
//  - RegexOptions.Multiline makes ^ match at the start of every line.
//  - The inline (?s) switches on single-line mode from that point onward so
//    that '.' in the lazy content group can span line breaks.
//  - The trailer may be followed by line breaks OR by the end of the input
//    (\z); without the \z alternative the last file is dropped whenever the
//    text does not end with a newline, as is the case for this sample.
string pattern = @"^%%%%*[\t ]*([-A-Za-z_.\\:\d]*)[\t ]*%%%%*(\r?\n)+(?s)(.*?)^%%%%*[\t ]*(--END--)[\t ]*%%%%*(?:(\r?\n)+\s*|\z)";

// Regex.Matches returns every non-overlapping match; project each match to
// its full matched text (header + content + trailer).
string[] output = Regex.Matches(input, pattern, RegexOptions.Multiline)
                       .OfType<Match>()
                       .Select(m => m.Value)
                       .ToArray();