如何在正则表达式中忽略不需要的分组

时间:2012-06-19 15:46:00

标签: c# regex

我通过流读取器(StreamReader)读取下面这样的节点,这样的节点可能有很多个。我只想提取节点中的少数几个字段,例如 REPLICATE_ID、ASSAY_NUMBER 以及几个日期字段。

节点内字段的顺序可能不同,有时也会出现新字段,但我想要提取的字段不会改变。

到目前为止,我所使用的正则表达式匹配的是整个节点,因此只要节点中出现新字段或字段顺序不同,匹配就会失败。是否有可能只匹配我感兴趣的那些分组?

   TEST_REPLICATE
    {
        REPLICATE_ID            453w
        ASSAY_NUMBER            334
        ASSAY_VERSION           4
        ASSAY_STATUS            test
        DILUTION_ID         1
        SAMPLE_ID           "NC_dede"
        SAMPLE_TYPE         Specimen
        TEST_ORDER_DATE         05.23.2012
        TEST_ORDER_TIME         04:25:07
        TEST_INITIATION_DATE        05.23.2012
        TEST_INITIATION_TIME        05:19:43
        TEST_COMPLETION_DATE        05.23.2012
        TEST_COMPLETION_TIME        05:48:01
        ASSAY_CALIBRATION_DATE      NA
        ASSAY_CALIBRATION_TIME      NA
        TRACK           1
        PROCESSING_LANE     1
        MODULE_SN       "EP004"
        LOAD_LIST_NAME          C:\BwedwQwedw_SCC\edwLoadlist2RACKSB.json
        OPERATOR_ID         "Q_dwe"
        DARK_SUBREADS           16 23 19 20 16 18 21 16 17 18 19 19 20 22 19 20 19 20 18 20 17 20 21 16 19 23 20 22 19 20
        SIGNAL_SUBREADS         18 17 20 21 42 61 41 31 30 30 26 26 25 22 24
        DARK_COUNT          577
        SIGNAL_COUNT            781
        CORRECTED_COUNT         204
        STD_BAK             1.95965044971226
        AVG_BAK             19.2333333333333
        STD_FOR             8.67212471810898
        AVG_FOR             26.0333333333333
        SHAPE               NA
        EXCEPTION_STRING        TestException - Parameters:Unable to process test, background read failure.
        RESULT              NA
        REPORTED_RESULT         NA
        REPORTED_RESULT_UNITS       NA
        REAGENT_MASTER_LOT      13600LI02
        REAGENT_SERIAL_NUMBER       25022
        RESULT_FLAGS            RUO
        RESULT_INTERPRETATION       NA
        DILUTION_PROTOCOL       UNDILUTED
        RESULT_COMMENT          frer 1 LANE A
        DATA_MANAGEMENT_FIELD_1     NA
        DATA_MANAGEMENT_FIELD_2     NA
        DATA_MANAGEMENT_FIELD_3     NA
        DATA_MANAGEMENT_FIELD_4     NA
    }

    // The asker's original pattern: it matches an entire TEST_REPLICATE { ... } node in one pass.
    // Every field name is written out literally, in one fixed order, and the text after each name
    // is captured by a lazy ([^}]*?) group that runs up to the next listed field name.
    // Because the names are chained with \s+ in sequence, the match depends on the node
    // containing these field names in exactly this order.
    string pat = @"TEST_REPLICATE\s*{\s*REPLICATE_ID\s*([^}]*?)\s+ASSAY_NUMBER\s*([^}]*?)\s+ASSAY_VERSION\s*([^}]*?)\s+DILUTION_ID\s*([^}]*?)\s+SAMPLE_ID\s*([^}]*?)\s+SAMPLE_TYPE\s*([^}]*?)\s+TEST_ORDER_DATE\s*([^}]*?)\s+TEST_ORDER_TIME\s*([^}]*?)\s+TEST_INITIATION_DATE\s*([^}]*?)\s+TEST_INITIATION_TIME\s*([^}]*?)\s+TEST_COMPLETION_DATE\s*([^}]*?)\s+TEST_COMPLETION_TIME\s*([^}]*?)\s+ASSAY_CALIBRATION_DATE\s*([^}]*?)\s+ASSAY_CALIBRATION_TIME\s*([^}]*?)\s+TRACK\s*([^}]*?)\s+PROCESSING_LANE\s*([^}]*?)\s+MODULE_SN\s*([^}]*?)\s+LOAD_LIST_NAME\s*([^}]*?)\s+OPERATOR_ID\s*([^}]*?)\s+DARK_SUBREADS\s*([^}]*?)\s+SIGNAL_SUBREADS\s*([^}]*?)\s+DARK_COUNT\s*([^}]*?)\s+SIGNAL_COUNT\s*([^}]*?)\s+CORRECTED_COUNT\s*([^}]*?)\s+STD_BAK\s*([^}]*?)\s+AVG_BAK\s*([^}]*?)\s+STD_FOR\s*([^}]*?)\s+AVG_FOR\s*([^}]*?)\s+SHAPE\s*([^}]*?)\s+EXCEPTION_STRING\s*([^}]*?)\s+RESULT\s*([^}]*?)\s+REPORTED_RESULT\s*([^}]*?)\s+REPORTED_RESULT_UNITS\s*([^}]*?)\s+REAGENT_MASTER_LOT\s*([^}]*?)\s+REAGENT_SERIAL_NUMBER\s*([^}]*?)\s+RESULT_FLAGS\s*([^}]*?)\s+RESULT_INTERPRETATION\s*([^}]*?)\s+DILUTION_PROTOCOL\s*([^}]*?)\s+RESULT_COMMENT\s*([^}]*?)\s+DATA_MANAGEMENT_FIELD_1\s*([^}]*?)\s+DATA_MANAGEMENT_FIELD_2\s*([^}]*?)\s+DATA_MANAGEMENT_FIELD_3\s*([^}]*?)\s+DATA_MANAGEMENT_FIELD_4\s*([^}]*?)\s*}";   

1 个答案:

答案 0 :(得分:0)

可以,您只需把每条记录解析成键值对即可。

如果要从记录中提取键值对,可以使用下面的代码示例。找到匹配项后,可以在捕获集合中逐个检查键,找出您需要的那些。

您还可以调整正则表达式中对记录开头/结尾的匹配方式,但不要改动核心部分(原子分组),它可以防止灾难性回溯。

正则表达式替代方案:

# Record starts on a new line, closing brace can be anywhere
# (free-spacing layout: pattern whitespace is ignored and '#' starts a comment)

^ [^\S\n]*TEST_REPLICATE\s*\{   # line start, optional indentation, record name, opening brace
 (?>                            # atomic group: consumes one "key value" line per iteration
      \s* (?<key> [^\s{}]+ ) [^\S\n]* (?<val> [^\n{}]*? ) [^\S\n]* (?:$|(?=\}))   # key, value, then end of line or a closing brace ahead
 )*                             # repeat for every line in the record (zero or more)
 \s*\}                          # optional whitespace, then the record's closing brace


# Record starts anywhere, closing brace is on a new line
# (free-spacing layout: pattern whitespace is ignored and '#' starts a comment)

TEST_REPLICATE\s*\{             # record name (no line-start anchor), opening brace
 (?>                            # atomic group: consumes one "key value" line per iteration
      \s* (?<key> [^\s{}]+ ) [^\S\n]* (?<val> [^\n{}]*? ) [^\S\n]* $   # key, value, and the line must end here
 )*                             # repeat for every line in the record (zero or more)
 \s*\}                          # optional whitespace, then the record's closing brace

C#测试代码:

// Pattern that matches one whole TEST_REPLICATE { ... } record per match.
// Inside the braces, a repeated atomic group consumes one "KEY value" line per iteration,
// so the named groups "key" and "val" each accumulate one capture per line of the record.
// RegexOptions.IgnorePatternWhitespace permits the layout and the # comments inside the
// pattern; RegexOptions.Multiline makes ^ and $ match at the start and end of each line.
Regex testRx = new Regex(
 @"
  ^ [^\S\n]* TEST_REPLICATE     # Record, starts on a newline
    \s*                         # Optional whitespaces (trims blank lines)
    \{                          # Record opening brace
      (?>                           # Atomic group
         \s*                           # Optional many whitespace (trims blank lines)
         # Line in record to be recorded
         (?<key> [^\s{}]+)                # required <key>, not whitespacs nor braces
         [^\S\n]*                         # trim whitespaces (don't include newline)
         (?<val> [^\n{}]*?)               # optional <value>, not newlines nor braces
         [^\S\n]*                         # trim whitespaces (don't include newline)
         (?:$|(?=\}))                   # End of line, or next char is a closing brace
      )*                            # End atomic group, do many times (optional)
    \s*                         # Optional whitespaces (trims blank lines)
  \}                            # Record closing brace
 ", RegexOptions.IgnorePatternWhitespace | RegexOptions.Multiline);

// Sample input for the record pattern. It holds three records:
//   1. an empty record (TEST_REPLICATE{}),
//   2. a short record whose opening brace is on the same line as the record name,
//   3. a long record whose opening brace is on its own line.
// The long record also includes a key with no value (REPLICATE_ID), values followed by
// trailing spaces, a blank line, and two lines with different indentation.
// Doubled quotes ("") are literal quote characters inside the verbatim string.
string testdata = @"
 TEST_REPLICATE{}
 TEST_REPLICATE{
     REPLICATE_ID            1asdf985
     ASSAY_NUMBER            123sdg
     ASSAY_VERSION           4sdgn
     ASSAY_TYPE            unknown
 }

 TEST_REPLICATE
 {
     REPLICATE_ID            
     ASSAY_NUMBER            123    
     ASSAY_VERSION           4   
     ASSAY_TYPE            unknown   
     DILUTION_ID         1
     SAMPLE_ID           ""NC_HIV1""
     SAMPLE_TYPE         Specimen
     TEST_ORDER_DATE         05.21.2012
     TEST_ORDER_TIME         03:44:01
     TEST_INITIATION_DATE        05.21.2012
     TEST_INITIATION_TIME        04:03:36

 TEST_COMPLETION_DATE        05.21.2012
 TEST_COMPLETION_TIME        04:29:32
     ASSAY_CALIBRATION_DATE              NA
     ASSAY_CALIBRATION_TIME      NA
     TRACK           1
     PROCESSING_LANE     1
     MODULE_SN       ""EP004""
     LOAD_LIST_NAME          C:\sdddd
     OPERATOR_ID         ""Q_SI""
     DARK_SUBREADS           NA
     SIGNAL_SUBREADS         NA
     DARK_COUNT          NA
     SIGNAL_COUNT            NA
     CORRECTED_COUNT         NA
     STD_BAK             NA
     AVG_BAK             NA
     STD_FOR             NA
     AVG_FOR             NA
     SHAPE               NA
     EXCEPTION_STRING        Test execution was stopped.
     RESULT              NA
     REPORTED_RESULT         NA
     REPORTED_RESULT_UNITS       NA
     REAGENT_MASTER_LOT      2345
     REAGENT_SERIAL_NUMBER       25022
     RESULT_FLAGS            NA
     RESULT_INTERPRETATION       NA
     DILUTION_PROTOCOL       UNDILUTED
     RESULT_COMMENT          HIV NC 1
     DATA_MANAGEMENT_FIELD_1     NA
     DATA_MANAGEMENT_FIELD_2     NA
     DATA_MANAGEMENT_FIELD_3     NA
     DATA_MANAGEMENT_FIELD_4     NA
 }
    ";

// Walk through the input one match at a time; each successful match is a single record.
for (Match record = testRx.Match(testdata); record.Success; record = record.NextMatch())
{
    Console.WriteLine("New Record\n------------------------");

    // "key" and "val" are captured inside a repeated group, so each group holds
    // one capture per field line; the two collections are read with the same index.
    Group keyGroup = record.Groups["key"];
    Group valGroup = record.Groups["val"];
    int fieldCount = keyGroup.Captures.Count;

    for (int field = 0; field < fieldCount; field++)
    {
        string key = keyGroup.Captures[field].Value;
        string val = valGroup.Captures[field].Value;

        Console.WriteLine("'{0}' = '{1}'", key, val);

        // This is the place to pick out the fields of interest,
        // for example by comparing key against "REAGENT_SERIAL_NUMBER".
    }

    Console.WriteLine("------------------------");
}