.NET正则表达式引擎没有返回匹配但我期待8

时间:2012-04-04 14:27:11

标签: c# .net regex

我正在尝试编写一个正则表达式来从SQL脚本中获取每个插入行。当我在Regex Hero I get my expected 8 matches上使用.NET Regex Tester时。但是,当我将此snippit作为控制台应用程序运行时,它不返回任何匹配项。

const string text =
@"INSERT INTO [AdminPrefs] ( [SpayClinic] , [VaxClinic] , [ShelterClinic] , [DateModified] , [Prefix] , [UpdateCounter] , [LockedRecs] , [dbName] , [Timer] , [MedCtrClinic] , [OtherClinic] , [Da2PPPx] , [Da2PPEPx] , [FVRCPPx] , [FVRCPEPx] , [FELVTPx] , [FELVTEPx] , [FELVVPx] , [FELVVEPx] , [HWTPx] , [HWTEPx] , [RabiesPx] , [RabiesEPx] , [FIVTest] , [FIVTestE] , [OnePlusChar] , [XSHWMPx] , [XSHWMEPx] , [SHWMPx] , [SHWMEPx] , [MHWMPx] , [MHWMEPx] , [LHWMPx] , [LHWMEPx] , [DebuggerOn] , [PayThisAmount] , [free6] , [XSHWMPillPx] , [XSHWMPillEPx] , [SHWMPillPx] , [SHWMPillEPx] , [MHWMPillPx] , [MHWMPillEPx] , [LHWMPillPx] , [LHWMPillEPx] , [free7] , [free8] , [free9] , [XSPMPx] , [XSPMEPx] , [SPMPx] , [SPMEPx] , [MPMPx] , [MPMEPx] , [LPMPx] , [LPMEPx] , [ReceiptFooter] , [MonthsUntilBenefits] , [free12] , [XSPMPillPx] , [XSPMPillEPx] , [SPMPillPx] , [SPMPillEPx] , [MPMPillPx] , [MPMPillEPx] , [LPMPillPx] , [LPMPillEPx] , [free14] , [ClinicName] , [ShelterName] , [ShelterAbbr] , [Address1] , [Address2] , [City] , [State] , [ZipCode] , [MainPhone] , [MainFax] , [SplashPict] , [free17] , [free18] , [LicenseNo] , [SerialNo] , [free20] , [free21] , [free22] , [VLogCC] , [SNLogCC] , [free23] , [free24] , [free25] , [AgeAndBDay] , [free26] , [free27] , [free28] , [CurrRouteNum] )
VALUES
(12 , 7 , 0 , '0000/00/00 00:00:00:00' , '' , 0 , '' , '' , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , '' , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , '' , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , '' , '' , '' , '' , '' , '' , '' , '' , '' , '' , X'5443503408' , 0 , 0 , '' , 0 , 0 , 0 , 0 , '' , '' , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0),
(15 , 53 , 0 , '0000/00/00 00:00:00:00' , '' , 0 , '' , '' , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , '' , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , '' , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , '' , '' , '' , '' , '' , '' , '' , '' , '' , '' , X'5443503408' , 0 , 0 , '' , 0 , 0 , 0 , 0 , '' , '' , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0),
(20 , 216 , 0 , '0000/00/00 00:00:00:00' , '' , 0 , '' , '' , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , '' , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , '' , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , '' , '' , '' , '' , '' , '' , '' , '' , '' , '' , X'5443503408' , 0 , 0 , '' , 0 , 0 , 0 , 0 , '' , '' , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0),
(16 , 8 , 0 , '0000/00/00 00:00:00:00' , '' , 0 , '' , '' , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , '' , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , '' , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , '' , '' , '' , '' , '' , '' , '' , '' , '' , '' , X'5443503408' , 0 , 0 , '' , 0 , 0 , 0 , 0 , '' , '' , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0);

INSERT INTO [AdminPrefs] ( [SpayClinic] , [VaxClinic] , [ShelterClinic] , [DateModified] , [Prefix] , [UpdateCounter] , [LockedRecs] , [dbName] , [Timer] , [MedCtrClinic] , [OtherClinic] , [Da2PPPx] , [Da2PPEPx] , [FVRCPPx] , [FVRCPEPx] , [FELVTPx] , [FELVTEPx] , [FELVVPx] , [FELVVEPx] , [HWTPx] , [HWTEPx] , [RabiesPx] , [RabiesEPx] , [FIVTest] , [FIVTestE] , [OnePlusChar] , [XSHWMPx] , [XSHWMEPx] , [SHWMPx] , [SHWMEPx] , [MHWMPx] , [MHWMEPx] , [LHWMPx] , [LHWMEPx] , [DebuggerOn] , [PayThisAmount] , [free6] , [XSHWMPillPx] , [XSHWMPillEPx] , [SHWMPillPx] , [SHWMPillEPx] , [MHWMPillPx] , [MHWMPillEPx] , [LHWMPillPx] , [LHWMPillEPx] , [free7] , [free8] , [free9] , [XSPMPx] , [XSPMEPx] , [SPMPx] , [SPMEPx] , [MPMPx] , [MPMEPx] , [LPMPx] , [LPMEPx] , [ReceiptFooter] , [MonthsUntilBenefits] , [free12] , [XSPMPillPx] , [XSPMPillEPx] , [SPMPillPx] , [SPMPillEPx] , [MPMPillPx] , [MPMPillEPx] , [LPMPillPx] , [LPMPillEPx] , [free14] , [ClinicName] , [ShelterName] , [ShelterAbbr] , [Address1] , [Address2] , [City] , [State] , [ZipCode] , [MainPhone] , [MainFax] , [SplashPict] , [free17] , [free18] , [LicenseNo] , [SerialNo] , [free20] , [free21] , [free22] , [VLogCC] , [SNLogCC] , [free23] , [free24] , [free25] , [AgeAndBDay] , [free26] , [free27] , [free28] , [CurrRouteNum] )
VALUES
(26 , 5 , 0 , '0000/00/00 00:00:00:00' , '' , 0 , '' , '' , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , '' , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , '' , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , '' , '' , '' , '' , '' , '' , '' , '' , '' , '' , X'5443503408' , 0 , 0 , '' , 0 , 0 , 0 , 0 , '' , '' , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0),
(18 , 12 , 0 , '0000/00/00 00:00:00:00' , '' , 0 , '' , '' , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , '' , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , '' , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , '' , '' , '' , '' , '' , '' , '' , '' , '' , '' , X'5443503408' , 0 , 0 , '' , 0 , 0 , 0 , 0 , '' , '' , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0),
(9 , 10 , 0 , '0000/00/00 00:00:00:00' , '' , 0 , '' , '' , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , '' , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , '' , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , '' , '' , '' , '' , '' , '' , '' , '' , '' , '' , X'5443503408' , 0 , 0 , '' , 0 , 0 , 0 , 0 , '' , '' , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0),
(2 , 72 , 0 , '0000/00/00 00:00:00:00' , '' , 0 , '' , '' , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , '' , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , '' , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , '' , '' , '' , '' , '' , '' , '' , '' , '' , '' , X'5443503408' , 0 , 0 , '' , 0 , 0 , 0 , 0 , '' , '' , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0);
";

static void Main(string[] args)
{
    string query = @"^\(.*?\)(,|;)$";

    var matches = Regex.Matches(text, query, RegexOptions.Singleline | RegexOptions.Multiline);

    Console.WriteLine("Expected Matches: 8");
    Console.WriteLine("Matches Found: {0}", matches.Count);

    Console.ReadLine();
}

对于网站和我的代码(Multiline和Singleline)我的选项是完全一样的,他们应该使用相同的.NET正则表达式引擎,那么是什么造成了两者之间的区别呢?


最终结果:

对于所有那些好奇我的最终正则表达式

@"(?<=^\()                       # The beginning of a line followed by a (
((('(?<c>.*?)'(?!')(?=[\s\)])) | # Text string in SQL supports line breaks
  (?<c>-?[\d\.]+) |              # Any numbers
  (X'(?<c>[0-9a-f]*)')           # Something formatted like X'0123456789abcdef'
  )(\s,\s)?                      # Spaces and commas between the records
)+                               # Repeat the pattern at least one time
(?=(?<!'')\)[;,]\r?$)            # The End of the line ending with ); or ), and not immediately proceeded by ''";     

请注意所有计划将其用于R&amp; D(rip-off和deploy)开发的人,这仅适用于我的SQL,因为它非常规律。如果与我的第三方程序未生成的SQL一起使用,则需要调整以处理许多我不需要处理的边缘情况。

以下是解析器解析代码的完整代码。希望它能帮助那些陷入类似事情的其他人。

foreach (var tableFolder in Directory.GetDirectories(_exportFolder))
{
    //Popluate the schema of the DataTable
    DataTable table = new DataTable();
    using (SqlDataAdapter ada = new SqlDataAdapter(String.Format("Select top 0 * from [{0}]", Path.GetFileName(tableFolder)), conn))
    {
        ada.Fill(table);
    }

    //All of the files to import for this table
    string[] filePaths = Directory.GetFiles(tableFolder, "*.sql");

    foreach (string file in filePaths)
    {
        string text;
        using (var txtRdr = new StreamReader(file))
        {
            text = txtRdr.ReadToEnd();
        }

        const string recordRegex =
                        @"(?<=^\()                       #The begining of a line followed by a (
                        ((('(?<s>.*?)'(?!')(?=[\s\)])) | # Something formatted like 'some text' supports line breaks
                            (?<n>-?[\d\.]+) |            # Any numbers
                            (X'(?<h>[0-9a-f]*)')         # Something formatted like X'0123456789abcdef'
                            )(\s,\s)?                    # Spaces and commas between the records
                        )+                               # Repeat the pattern at least one time
                        (?=(?<!'')\)[;,]\r?$)            # The End of the line ending with ); or ), and not immedatly proceded by ''";            

        //Creates one match per row in the database
        var records = Regex.Matches(text, recordRegex, RegexOptions.Singleline | RegexOptions.Multiline | RegexOptions.IgnorePatternWhitespace | RegexOptions.IgnoreCase | RegexOptions.ExplicitCapture);

        const string headerRegex = @"^INSERT\sINTO\s\[[\w_\-\s]+\]\s\(\s(?:\[([\w_\-\s]+)\]\s(?:,\s)?)+\)";
        var header = Regex.Match(text, headerRegex).Groups[1].Captures.Cast<Capture>().ToArray();

        foreach (Match record in records)
        {
            //Due to how we captured the 3 groups we had to put them back in order in one list.
            var columns = record.Groups.Cast<Group>()
                                .Skip(1)  //Groups[0] contins the entire record.
                                .SelectMany(group => group.Captures.Cast<Capture>()) //Flattens all of the captures in the three groups in to one list
                                .OrderBy(capture => capture.Index) //Reorder the combined list as the SelectMany will not be outputting the correct order.
                                .ToArray(); 

            DataRow row = table.NewRow();
            for (int i = 0; i < columns.Length; i++)
            {
                Type columnType = table.Columns[header[i].Value].DataType;
                if (columnType == typeof(String))
                {
                    row[header[i].Value] = columns[i].Value;
                }
                else if (columnType == typeof(Int32))
                {
                    row[header[i].Value] = Convert.ToInt32(columns[i].Value);
                }
                else if (columnType == typeof(Double))
                {
                    row[header[i].Value] = Convert.ToDouble(columns[i].Value);
                }
                else if (columnType == typeof(Boolean))
                {
                    if (columns[i].Value == "0")
                        row[header[i].Value] = false;
                    else if (columns[i].Value == "1")
                        row[header[i].Value] = true;
                    else
                        throw new InvalidDataException();
                }
                else if (columnType == typeof(Int16))
                {
                    row[header[i].Value] = Convert.ToInt16(columns[i].Value);
                }
                else if (columnType == typeof(Byte[]))
                {
                    row[header[i].Value] = StringToByteArray(columns[i].Value);
                }
                else
                {
                    throw new NotImplementedException();
                }

            }
            table.Rows.Add(row);
        }

        using (var bulkCopy = new SqlBulkCopy(conn))
        {
            bulkCopy.DestinationTableName = Path.GetFileName(tableFolder);
            bulkCopy.BulkCopyTimeout = 0;
            bulkCopy.WriteToServer(table);
        }
    }
}

更新

通过将caputre组重命名为所有相同的名称,.NET的正则表达式引擎将它们组合起来,简化了

var columns = record.Groups[1].Cast<Group>().Skip(1).SelectMany(group => group.Captures.Cast<Capture>()).OrderBy(capture => capture.Index).ToArray();

var columns = record.Groups[1].Captures.Cast<Capture>().ToArray();

1 个答案:

答案 0 :(得分:6)

请注意,在Regex Hero页面上切换“CrLf标记行结束”设置会导致8行停止匹配;这是导致问题的原因的线索。

在C#代码中,文字字符串中的换行符被编码为CR / LF对("\r\n")。正则表达式中的$(与多行模式中的行尾匹配)仅匹配\n字符。因此,正则表达式未考虑的最终逗号(或分号)之间存在额外的\r字符,并且匹配失败。

您可以通过以下方式解决此问题:

  1. 剥去回车:text = text.Replace("\r\n", "\n");
  2. 匹配回车:string query = @"^\(.*?\)(,|;)\r$";