通过正则表达式匹配来对相似类型的消息进行分组的算法?

时间:2017-09-04 10:31:41

标签: c# algorithm parallel-processing task-parallel-library

我有一个预定义的正则表达式模式字符串列表(大约 7000 种正则表达式模式,用于对相似类型的消息进行分组)。

现在我有两个列表:一个是正则表达式模式(regex patterns)列表,另一个是包含若干变量名称的实际消息(real messages)列表。

我需要对所有相似的消息进行分组并显示这些分组后的消息。目前我遍历 7000 个正则表达式模式,以便对 1000 条消息中的相似项进行分组,因此需要 m×n 次迭代才能找到正确的分组。

为减少处理时间,我从消息列表中删除了已匹配的项。例如:1000 -(上一次迭代中已匹配的项数)。

处理这两个列表花费的时间太长。为了缩短时间,我按消息类别类型对它们进行分组,并在并行任务中处理它们。

// Split the knowledge-base entries into one list per error category, keyed by the ErrorType text.
List<KBError> warningKBErrors = (from kbEntry in distinctKBErrors
                                 where kbEntry.ErrorType == "Warning"
                                 select kbEntry).ToList();
List<KBError> fatalKBErrors = (from kbEntry in distinctKBErrors
                               where kbEntry.ErrorType == "Fatal"
                               select kbEntry).ToList();
List<KBError> severeKBErrors = (from kbEntry in distinctKBErrors
                                where kbEntry.ErrorType == "Severe"
                                select kbEntry).ToList();
List<KBError> cbeccErrorKBErrors = (from kbEntry in distinctKBErrors
                                    where kbEntry.ErrorType == "Error"
                                    select kbEntry).ToList();

// Drop every message whose ErrorType is not selected in processingErrorType,
// so only the messages that should be processed remain in errors.
errors.RemoveAll(candidate => !processingErrorType.HasFlag(candidate.ErrorType));

// Split the remaining messages into one list per error category.
List<Error> warningErrors = (from message in errors
                             where message.ErrorType == ErrorType.Warning
                             select message).ToList();
List<Error> fatalErrors = (from message in errors
                           where message.ErrorType == ErrorType.Fatal
                           select message).ToList();
List<Error> severeErrors = (from message in errors
                            where message.ErrorType == ErrorType.Severe
                            select message).ToList();
List<Error> cbeccErrors = (from message in errors
                           where message.ErrorType == ErrorType.Error
                           select message).ToList();

之后,将这些消息拆分为大小相等的子集,并在并行任务中分别处理各个子集。

// Groups the messages in filteredErros by the knowledge-base patterns in filteredKBErros.
//
// Every KB entry's ErrorMessage is used as a regular expression and tested against every
// message. For each pattern with at least one match, a summary Error is appended to the
// shared errors list (the single matching message itself, or a new Error that wraps all
// matches in its Errors list) and the matched messages are removed from that list.
//
// Parameters:
//   filteredKBErros - KB entries whose ErrorMessage is the regex pattern to test.
//   filteredErros   - the messages to test against those patterns.
// Returns: a snapshot copy of the shared errors list taken after this call has merged its results.
// Throws:  ArgumentException (surfaced through Parallel.For) when a pattern is not a valid regex.
Func<List<KBError>, List<Error>, List<Error>> FindDistinctErrorMessages = (filteredKBErros, filteredErros) =>
{
    // This delegate may be invoked from several tasks at once and every invocation mutates the
    // shared errors list, so the lock has to be shared as well. A lock object created per
    // invocation would only serialize the worker threads of one Parallel.For, leaving
    // concurrent invocations free to corrupt the list.
    object errorsGate = ((System.Collections.ICollection)errors).SyncRoot;

    System.Diagnostics.Stopwatch sw = System.Diagnostics.Stopwatch.StartNew();

    // With no messages nothing can match, so skip the loop (and regex construction) entirely.
    if (filteredErros.Count > 0)
    {
        Parallel.For(0, filteredKBErros.Count,
            // Thread-local accumulator: pattern -> messages matching it.
            () => new Dictionary<KBError, List<Error>>(),
            (x, loopState, kpErrorResult) =>
            {
                KBError kbError = filteredKBErros[x];

                // Build the regex once per pattern instead of resolving it again for every message.
                Regex pattern = new Regex(kbError.ErrorMessage, RegexOptions.IgnorePatternWhitespace);

                kpErrorResult.Add(kbError, filteredErros
                    .Where(error => pattern.IsMatch(error.ErrorMessage))
                    .ToList());
                return kpErrorResult;
            },
            (kpErrorResult) =>
            {
                lock (errorsGate)
                {
                    foreach (KeyValuePair<KBError, List<Error>> errorResult in kpErrorResult)
                    {
                        List<Error> matches = errorResult.Value;
                        if (matches.Count == 0)
                        {
                            continue;
                        }

                        Error error;
                        if (matches.Count == 1)
                        {
                            // A single match is annotated in place rather than wrapped.
                            error = matches[0];
                        }
                        else
                        {
                            // Several matches: create a parent entry that carries them all.
                            error = new Error();
                            error.ErrorMessage = matches[0].ErrorMessage;
                            error.Errors = matches;
                            error.ErrorType = matches[0].ErrorType;
                        }

                        error.ErrorCount = matches.Count;
                        error.ErrorCode = errorResult.Key.ErrorCode;
                        AddErrorResolutionMessage(error, errorResult.Key);
                        error.ErrorMessagePattern = errorResult.Key.ErrorMessage;

                        // Add the summary first, then remove the matched messages. For a single
                        // match this removes the earlier occurrence of the same instance and
                        // keeps the one just appended.
                        errors.Add(error);
                        matches.ForEach(err => errors.Remove(err));
                    }
                }
            }
            );
    }

    sw.Stop();
    System.Diagnostics.Debug.WriteLine(string.Format("Completed in {0} seconds", sw.Elapsed.TotalSeconds));

    // Copy under the same lock so the snapshot is not taken while another invocation is merging.
    lock (errorsGate)
    {
        return errors.ToList();
    }

};


// Pass each category's KB pattern list, together with that category's messages, through FilterKBList.
// NOTE(review): FilterKBList is not shown here — presumably it discards patterns that cannot
// match any of the given messages; confirm against its definition.
List<KBError> filteredWarningKBList = FilterKBList(warningKBErrors, warningErrors);
List<KBError> filteredSevereKBList = FilterKBList(severeKBErrors, severeErrors);
List<KBError> filteredFatalKBList = FilterKBList(fatalKBErrors, fatalErrors);
List<KBError> filteredcbeccErrorsKBList = FilterKBList(cbeccErrorKBErrors, cbeccErrors);


// Collects one task per message subset; each task yields the list returned by FindDistinctErrorMessages.
List<Task<List<Error>>> tasks = new List<Task<List<Error>>>();

// Queues the grouping tasks for one error category.
// Nothing is queued when the category has no messages or is not selected in processingErrorType.
Action<List<Error>, List<KBError>, ErrorType> queueGroupingTasks = (categoryErrors, categoryKBList, category) =>
{
    if (categoryErrors.Count == 0)
    {
        return;
    }

    if (!(processingErrorType.HasFlag(category) || processingErrorType.Equals(ErrorType.All)))
    {
        return;
    }

    // Value handed to Split: a tenth of the message count, but never less than 1.
    int equalCounts = Math.Max(1, categoryErrors.Count / 10);
    foreach (IEnumerable<Error> subSet in categoryErrors.Split(equalCounts))
    {
        tasks.Add(Task.Run<List<Error>>(
            () => FindDistinctErrorMessages(categoryKBList, subSet.ToList()),
            CancellationToken.None));
    }
};

// Same category order as before: warning, severe, fatal, error.
queueGroupingTasks(warningErrors, filteredWarningKBList, ErrorType.Warning);
queueGroupingTasks(severeErrors, filteredSevereKBList, ErrorType.Severe);
queueGroupingTasks(fatalErrors, filteredFatalKBList, ErrorType.Fatal);
queueGroupingTasks(cbeccErrors, filteredcbeccErrorsKBList, ErrorType.Error);

启动这些任务后,它们需要很长时间才能完成。等待这些任务的语句(Task.WaitAll)不知为何会使应用程序处于挂起状态。

try
{
    // Block until every grouping task has finished, then pool the per-task lists
    // and drop duplicate entries.
    Task.WaitAll(tasks.ToArray());
    List<Error> result = tasks
        .SelectMany(finishedTask => finishedTask.Result)
        .Distinct()
        .ToList();

    // Entries that share an ErrorMessagePattern are folded into the first entry of their group.
    foreach (var patternGroup in result.GroupBy(res => res.ErrorMessagePattern).ToList())
    {
        Error representative = patternGroup.First();
        representative.ErrorCount = patternGroup.Sum(member => member.ErrorCount);

        if (patternGroup.Count() > 1)
        {
            foreach (Error member in patternGroup.ToList())
            {
                if (member != representative)
                {
                    if (representative.Errors == null)
                    {
                        representative.Errors = new List<Error>();
                    }

                    member.ErrorCount = 1;

                    // Only a member's nested Errors are moved to the representative; a member
                    // without nested Errors contributes solely through the summed count above.
                    if (member.Errors != null && member.Errors.Count > 0)
                    {
                        representative.Errors.AddRange(member.Errors);
                        member.Errors = null;
                    }
                }
            }
        }

        distinctErrors.Add(representative);
    }
}
finally
{
    // NOTE(review): no cleanup is performed here; the try/finally could be removed.
}

// Every entry of errors gets a count of 1, is passed to AddErrorResolutionMessage with a
// null KB entry, and is appended to distinctErrors. Messages whose PossibleResolution is
// "Not Found" are also written to logMessage.
foreach (Error remaining in errors)
{
    remaining.ErrorCount = 1;
    AddErrorResolutionMessage(remaining, null);
    distinctErrors.Add(remaining);

    if (remaining.PossibleResolution == "Not Found")
    {
        logMessage.AppendLine(remaining.ErrorMessage);
    }
}
  

是否有更好的方法或算法,可以缩短处理这些列表的时间,并降低处理 m×n 个元素的时间复杂度?

0 个答案:

没有答案