我通过流读取器(StreamReader)读取到如下节点,这样的节点可能有很多个。我只想提取节点中的少数几个字段,例如 REPLICATE_ID、ASSAY_NUMBER 以及几个日期字段。
节点内字段的顺序可能不同,有时也会出现新字段,但我想要提取的字段不会改变。
到目前为止,我使用的正则表达式匹配整个节点,因此只要节点出现新字段或字段顺序发生变化,匹配就会失败。是否可以只匹配我感兴趣的那些字段(捕获组)?
TEST_REPLICATE
{
REPLICATE_ID 453w
ASSAY_NUMBER 334
ASSAY_VERSION 4
ASSAY_STATUS test
DILUTION_ID 1
SAMPLE_ID "NC_dede"
SAMPLE_TYPE Specimen
TEST_ORDER_DATE 05.23.2012
TEST_ORDER_TIME 04:25:07
TEST_INITIATION_DATE 05.23.2012
TEST_INITIATION_TIME 05:19:43
TEST_COMPLETION_DATE 05.23.2012
TEST_COMPLETION_TIME 05:48:01
ASSAY_CALIBRATION_DATE NA
ASSAY_CALIBRATION_TIME NA
TRACK 1
PROCESSING_LANE 1
MODULE_SN "EP004"
LOAD_LIST_NAME C:\BwedwQwedw_SCC\edwLoadlist2RACKSB.json
OPERATOR_ID "Q_dwe"
DARK_SUBREADS 16 23 19 20 16 18 21 16 17 18 19 19 20 22 19 20 19 20 18 20 17 20 21 16 19 23 20 22 19 20
SIGNAL_SUBREADS 18 17 20 21 42 61 41 31 30 30 26 26 25 22 24 DARK_COUNT 577
SIGNAL_COUNT 781
CORRECTED_COUNT 204
STD_BAK 1.95965044971226
AVG_BAK 19.2333333333333
STD_FOR 8.67212471810898
AVG_FOR 26.0333333333333
SHAPE NA
EXCEPTION_STRING TestException - Parameters:Unable to process test, background read failure.
RESULT NA
REPORTED_RESULT NA
REPORTED_RESULT_UNITS NA
REAGENT_MASTER_LOT 13600LI02
REAGENT_SERIAL_NUMBER 25022
RESULT_FLAGS RUO
RESULT_INTERPRETATION NA
DILUTION_PROTOCOL UNDILUTED
RESULT_COMMENT frer 1 LANE A
DATA_MANAGEMENT_FIELD_1 NA
DATA_MANAGEMENT_FIELD_2 NA
DATA_MANAGEMENT_FIELD_3 NA
DATA_MANAGEMENT_FIELD_4 NA
}
// Pattern that matches one whole TEST_REPLICATE { ... } record. Every field name is
// spelled out in a fixed order and each value is captured lazily in its own numbered
// group, so the pattern depends on the exact field sequence of the record.
// NOTE(review): ASSAY_STATUS, which appears in the sample record between ASSAY_VERSION
// and DILUTION_ID, is not listed in this pattern.
string pat = @"TEST_REPLICATE\s*{\s*REPLICATE_ID\s*([^}]*?)\s+ASSAY_NUMBER\s*([^}]*?)\s+ASSAY_VERSION\s*([^}]*?)\s+DILUTION_ID\s*([^}]*?)\s+SAMPLE_ID\s*([^}]*?)\s+SAMPLE_TYPE\s*([^}]*?)\s+TEST_ORDER_DATE\s*([^}]*?)\s+TEST_ORDER_TIME\s*([^}]*?)\s+TEST_INITIATION_DATE\s*([^}]*?)\s+TEST_INITIATION_TIME\s*([^}]*?)\s+TEST_COMPLETION_DATE\s*([^}]*?)\s+TEST_COMPLETION_TIME\s*([^}]*?)\s+ASSAY_CALIBRATION_DATE\s*([^}]*?)\s+ASSAY_CALIBRATION_TIME\s*([^}]*?)\s+TRACK\s*([^}]*?)\s+PROCESSING_LANE\s*([^}]*?)\s+MODULE_SN\s*([^}]*?)\s+LOAD_LIST_NAME\s*([^}]*?)\s+OPERATOR_ID\s*([^}]*?)\s+DARK_SUBREADS\s*([^}]*?)\s+SIGNAL_SUBREADS\s*([^}]*?)\s+DARK_COUNT\s*([^}]*?)\s+SIGNAL_COUNT\s*([^}]*?)\s+CORRECTED_COUNT\s*([^}]*?)\s+STD_BAK\s*([^}]*?)\s+AVG_BAK\s*([^}]*?)\s+STD_FOR\s*([^}]*?)\s+AVG_FOR\s*([^}]*?)\s+SHAPE\s*([^}]*?)\s+EXCEPTION_STRING\s*([^}]*?)\s+RESULT\s*([^}]*?)\s+REPORTED_RESULT\s*([^}]*?)\s+REPORTED_RESULT_UNITS\s*([^}]*?)\s+REAGENT_MASTER_LOT\s*([^}]*?)\s+REAGENT_SERIAL_NUMBER\s*([^}]*?)\s+RESULT_FLAGS\s*([^}]*?)\s+RESULT_INTERPRETATION\s*([^}]*?)\s+DILUTION_PROTOCOL\s*([^}]*?)\s+RESULT_COMMENT\s*([^}]*?)\s+DATA_MANAGEMENT_FIELD_1\s*([^}]*?)\s+DATA_MANAGEMENT_FIELD_2\s*([^}]*?)\s+DATA_MANAGEMENT_FIELD_3\s*([^}]*?)\s+DATA_MANAGEMENT_FIELD_4\s*([^}]*?)\s*}";
答案 0 :(得分:0)
可以。您只需把每条记录解析为键值对即可。
如果要从记录中提取键值对,可以使用下面的代码示例。找到匹配项后,可以在捕获集合(CaptureCollection)中逐个检查键,找出您需要的那些键。
您也可以调整正则表达式中记录开头/结尾的匹配方式,但不要改动核心部分(原子组),它用于防止灾难性回溯。
正则表达式替代方案:
# Record starts on a new line, closing brace can be anywhere
# Free-spacing form: literal whitespace in the pattern is ignored, and ^ / $ are
# meant to match at line boundaries (multiline mode).
# Header: optional horizontal whitespace, the record name, then the opening brace.
^ [^\S\n]*TEST_REPLICATE\s*\{
# Atomic group, repeated once per key/value line inside the braces.
(?>
# key = run of non-whitespace, non-brace chars; val = rest of the line (may be empty);
# the line must end at end-of-line or directly before the closing brace.
\s* (?<key> [^\s{}]+ ) [^\S\n]* (?<val> [^\n{}]*? ) [^\S\n]* (?:$|(?=\}))
)*
# Closing brace, after any remaining whitespace.
\s*\}
# Record starts anywhere, closing brace is on a new line
# Free-spacing form: literal whitespace in the pattern is ignored, and $ is
# meant to match at the end of each line (multiline mode).
# Header: the record name (no line-start anchor), then the opening brace.
TEST_REPLICATE\s*\{
# Atomic group, repeated once per key/value line inside the braces.
(?>
# key = run of non-whitespace, non-brace chars; val = rest of the line (may be empty);
# every key/value line must run to end-of-line.
\s* (?<key> [^\s{}]+ ) [^\S\n]* (?<val> [^\n{}]*? ) [^\S\n]* $
)*
# Closing brace, after any remaining whitespace.
\s*\}
C#测试代码:
// Options for the record regex: the pattern is written in free-spacing form with
// inline # comments, and ^ / $ have to match at every line boundary.
RegexOptions recordOptions = RegexOptions.Multiline | RegexOptions.IgnorePatternWhitespace;

// One match of this pattern is one TEST_REPLICATE { ... } record. The "key" and "val"
// groups sit inside a repeated atomic group, so each of them collects one capture per
// line of the record.
string recordPattern = @"
^ [^\S\n]* TEST_REPLICATE # Record, starts on a newline
\s* # Optional whitespaces (trims blank lines)
\{ # Record opening brace
(?> # Atomic group
\s* # Optional many whitespace (trims blank lines)
# Line in record to be recorded
(?<key> [^\s{}]+) # required <key>, not whitespacs nor braces
[^\S\n]* # trim whitespaces (don't include newline)
(?<val> [^\n{}]*?) # optional <value>, not newlines nor braces
[^\S\n]* # trim whitespaces (don't include newline)
(?:$|(?=\})) # End of line, or next char is a closing brace
)* # End atomic group, do many times (optional)
\s* # Optional whitespaces (trims blank lines)
\} # Record closing brace
";

Regex testRx = new Regex(recordPattern, recordOptions);
// Sample input holding three records: an empty one, a short one with the opening
// brace on the header line, and a full one with the brace on its own line whose
// REPLICATE_ID has no value. Quotes are doubled because this is a verbatim string.
string testdata = @"
TEST_REPLICATE{}
TEST_REPLICATE{
REPLICATE_ID 1asdf985
ASSAY_NUMBER 123sdg
ASSAY_VERSION 4sdgn
ASSAY_TYPE unknown
}
TEST_REPLICATE
{
REPLICATE_ID
ASSAY_NUMBER 123
ASSAY_VERSION 4
ASSAY_TYPE unknown
DILUTION_ID 1
SAMPLE_ID ""NC_HIV1""
SAMPLE_TYPE Specimen
TEST_ORDER_DATE 05.21.2012
TEST_ORDER_TIME 03:44:01
TEST_INITIATION_DATE 05.21.2012
TEST_INITIATION_TIME 04:03:36
TEST_COMPLETION_DATE 05.21.2012
TEST_COMPLETION_TIME 04:29:32
ASSAY_CALIBRATION_DATE NA
ASSAY_CALIBRATION_TIME NA
TRACK 1
PROCESSING_LANE 1
MODULE_SN ""EP004""
LOAD_LIST_NAME C:\sdddd
OPERATOR_ID ""Q_SI""
DARK_SUBREADS NA
SIGNAL_SUBREADS NA
DARK_COUNT NA
SIGNAL_COUNT NA
CORRECTED_COUNT NA
STD_BAK NA
AVG_BAK NA
STD_FOR NA
AVG_FOR NA
SHAPE NA
EXCEPTION_STRING Test execution was stopped.
RESULT NA
REPORTED_RESULT NA
REPORTED_RESULT_UNITS NA
REAGENT_MASTER_LOT 2345
REAGENT_SERIAL_NUMBER 25022
RESULT_FLAGS NA
RESULT_INTERPRETATION NA
DILUTION_PROTOCOL UNDILUTED
RESULT_COMMENT HIV NC 1
DATA_MANAGEMENT_FIELD_1 NA
DATA_MANAGEMENT_FIELD_2 NA
DATA_MANAGEMENT_FIELD_3 NA
DATA_MANAGEMENT_FIELD_4 NA
}
";
// Walk through every record found in the input; each successful Match is one record.
Match m_testrec = testRx.Match(testdata);
for (; m_testrec.Success; m_testrec = m_testrec.NextMatch())
{
    Console.WriteLine("New Record\n------------------------");

    // "key" and "val" are captured together on every repetition of the atomic group,
    // so the two capture lists are read with the same index.
    GroupCollection recordGroups = m_testrec.Groups;
    CaptureCollection keyCaptures = recordGroups["key"].Captures;
    CaptureCollection valueCaptures = recordGroups["val"].Captures;

    int lineIndex = 0;
    while (lineIndex < keyCaptures.Count)
    {
        string fieldName = keyCaptures[lineIndex].Value;
        string fieldValue = valueCaptures[lineIndex].Value;
        Console.WriteLine("'{0}' = '{1}'", fieldName, fieldValue);

        // Pick out the fields of interest here, e.g.
        // if (fieldName == "REAGENT_SERIAL_NUMBER") ...
        lineIndex++;
    }

    Console.WriteLine("------------------------");
}