Question

我需要知道达到上述结果的最佳方法是什么

我有以下类：

/// <summary>
/// A single log entry. Logs are related to each other through the values in <see cref="Ids"/>.
/// </summary>
public class Log
{
    /// <summary>Ids carried by this log; the sample data also contains a log with an empty set.</summary>
    public HashSet<string> Ids { get; set; }
    /// <summary>Identifier of the log itself; the grouping code uses it to refer to individual logs.</summary>
    public string UniqueId { get; set; }
}
/// <summary>
/// Container holding the list of <see cref="Log"/> entries to be grouped.
/// </summary>
public class GroupModel 
{
    /// <summary>The logs to group.</summary>
    public List<Log> Logs { get; set; }
}

数据设置如下：

// Sample input: ten logs. Logs 1-5 and 7-9 chain together through shared ids,
// log 6 shares no id with any other log, and log 10 carries no ids at all.
GroupModel webApiGroupModel = new GroupModel
{
    Logs = new List<Log>
    {
        new Log { UniqueId = "1", Ids = new HashSet<string> { "a", "g" } },
        new Log { UniqueId = "2", Ids = new HashSet<string> { "b", "c" } },
        new Log { UniqueId = "3", Ids = new HashSet<string> { "a", "b" } },
        new Log { UniqueId = "4", Ids = new HashSet<string> { "e" } },
        new Log { UniqueId = "5", Ids = new HashSet<string> { "d", "e" } },
        new Log { UniqueId = "6", Ids = new HashSet<string> { "f" } },
        new Log { UniqueId = "7", Ids = new HashSet<string> { "g" } },
        new Log { UniqueId = "8", Ids = new HashSet<string> { "a", "g" } },
        new Log { UniqueId = "9", Ids = new HashSet<string> { "h", "e", "g" } },
        // Intentionally left without any ids.
        new Log { UniqueId = "10", Ids = new HashSet<string>() },
    }
};

我需要根据相关ID对它们进行分组，因此结果将如下所示：

Group1 =具有UniqueId 1,2,3,4,5,7,8,9的列表

Group2 =具有唯一ID 6的列表

Group3 =具有唯一ID 10的列表

说明：

Group1：如果任何Log与其他Log有任意一个Ids项匹配，则必须将它们分到同一组。由于具有唯一ID 1的日志具有ID“a，g”，它们存在于具有唯一ID 3,7,8,9的日志中，因此将它们组合在一起；但是3,7,8,9还包含其他项，即“b，e”，它们存在于2、4、5中，因此所有这些都被分到同一组，即1,2,3,4,5,7,8,9

Group2、3：它们的ID在任何其他日志中均不存在，因此它们各自单独成组。

我正在尝试以最佳方式实现这一目标，因为我使用的解决方案需要2分钟才能对包含25K日志的数据集进行分组，这非常糟糕。每个日志中的最大ID数可以为3。

下面是我的解决方案，您可以帮助我进行优化，或者提供一种完全不同的方法来实现上述结果。

/// <summary>
/// Groups logs by shared <c>Ids</c> values: logs containing a common id are put in the same
/// group, and groups that overlap are merged into one.
/// </summary>
/// <param name="logs">Logs to group; every log's <c>Ids</c> is dereferenced without a null check.</param>
/// <returns>
/// One list of logs per resulting group, followed by one extra list holding every log whose
/// <c>UniqueId</c> never entered a group (only when at least one such log exists).
/// </returns>
/// <remarks>
/// For every id of every log the method scans the whole <paramref name="logs"/> list and all
/// groups built so far, which is what makes it slow on large inputs.
/// </remarks>
private static ICollection<List<Log>> GroupIds(List<Log> logs)
    {
        // Group number -> set of Log.UniqueId values currently assigned to that group.
        Dictionary<int, HashSet<string>> tempgroupedIds = new Dictionary<int, HashSet<string>>();
        // UniqueId values of all logs that contain the id currently being processed.
        HashSet<string> UniqueIds = new HashSet<string>();
        // Every UniqueId that has been placed in some group; used at the end to find ungrouped logs.
        HashSet<string> tempgroupedIds2 = new HashSet<string>();
        // Next group number to hand out.
        int id = 1;

        foreach (var log in logs)
        {
            // Numbers of the existing groups hit by this log's ids; accumulates across all ids of the log.
            List<int> tempGroupNames = new List<int>();
            foreach (var Id in log.Ids)
            {
                if (!string.IsNullOrEmpty(Id))
                {
                    // Full scan of the input: collect the UniqueId of every log that contains this id.
                    UniqueIds = new HashSet<string>(logs
                        .Where(d => d.Ids.Contains(Id)).Select(p => p.UniqueId));

                    // Deferred query over all existing groups that share a UniqueId with the set above.
                    // It is enumerated once by Any() and again by ToList() below.
                    var OtherLogsContainingUniqueIds = tempgroupedIds
                        .Where(d => UniqueIds != null && d.Value.Intersect(UniqueIds).Any())
                        .Select(d => d.Key);
                    if (OtherLogsContainingUniqueIds.Any())
                    {
                        tempGroupNames.AddRange(OtherLogsContainingUniqueIds.ToList());
                    }
                }

                // NOTE(review): this bookkeeping also runs when Id is null or empty; in that case
                // UniqueIds still holds the value computed for a previous id - confirm this is intended.
                if (tempGroupNames.Any())
                {
                    // Union of the members of every matched group plus the logs found for this id.
                    var tempCorelationids = new HashSet<string>(tempgroupedIds.Where(d => tempGroupNames.Contains(d.Key)).SelectMany(a => a.Value));
                    tempCorelationids.UnionWith(UniqueIds);

                    foreach (var groupname in tempGroupNames)
                    {
                        // The matched groups are replaced by the merged group added below.
                        // Group numbers already removed for an earlier id of this log may still be listed here.
                        tempgroupedIds.Remove(groupname);
                    }

                    tempgroupedIds2.UnionWith(UniqueIds);
                    tempgroupedIds.Add(id, tempCorelationids);
                    id++;
                }
                else
                {
                    // No existing group overlaps: start a new group, which stays separate until a
                    // later id connects it to another one.
                    tempgroupedIds2.UnionWith(UniqueIds);
                    tempgroupedIds
                        .Add(id, UniqueIds);
                    id++;
                }
            }
        }

        ICollection<List<Log>> finalGroup = new Collection<List<Log>>();

        // Translate every group of UniqueId values back into the corresponding Log objects
        // (another full scan of the input per group).
        foreach (var groupedlogKey in tempgroupedIds)
        {
            var group = logs
                .Where(a => groupedlogKey.Value.Contains(a.UniqueId)).ToList();
            finalGroup.Add(group);
        }

        // Logs whose UniqueId never entered any group (e.g. those with empty Ids) share one final group.
        var anonymousLogs = logs
            .Where(a => !tempgroupedIds2.Contains(a.UniqueId)).ToList();
        if (anonymousLogs.Count >= 1)
        {
            finalGroup.Add(anonymousLogs);
        }

        return finalGroup;
    }
}

Answer 1

您的算法在输入列表上包含许多内部线性运算，这使其具有二次的O(N*N*K)时间复杂度（K的漏洞足以影响性能）。

关键点在于：通过合并每个元素的id和每个与之相交的元素的id，来确定相关ID的唯一集合。

为了有效地做到这一点，我们将使用单遍构建这样的结构

// Maps each id to the set of ids it is grouped with; related ids share the same set instance.
var idSetById = new Dictionary<string, HashSet<string>>();

其中以下约束成立：

// Invariant of idSetById: every key is a member of its own set, and every member of a set
// maps back to that very same set instance (reference comparison, HashSet has no == overload).
foreach (var item in idSetById)
{
     Debug.Assert(item.Value.Contains(item.Key));
     foreach (var id in item.Value)
         Debug.Assert(idSetById.ContainsKey(id) && idSetById[id] == item.Value);
}

为什么？首先，因为它可以在线性时间内创建，其次，上述约束使其可以用作有效的GroupBy键选择器

.GroupBy(log => idSetById[log.Ids.First()])

（注意：以上内容适用于非空id集。这些空集将在源迭代过程中简单地添加到单独的列表中）。

这是完整的方法：

/// <summary>
/// Groups logs that are connected, directly or through other logs, by shared ids.
/// </summary>
/// <param name="logs">Logs to group; each log's <c>Ids</c> is read without a null check.</param>
/// <returns>
/// One list per group of connected logs, followed by a single list with all logs that have
/// no ids (added only when such logs exist).
/// </returns>
private static ICollection<List<Log>> GroupIds(List<Log> logs)
{
    // id -> the set of ids it belongs to; all ids of one group share a single set instance.
    var setOfId = new Dictionary<string, HashSet<string>>();
    var logsWithoutIds = new List<Log>();
    // Distinct existing sets hit by the ids of the log being processed (compared by reference).
    var touchedSets = new HashSet<HashSet<string>>();

    foreach (var entry in logs)
    {
        if (entry.Ids.Count == 0)
        {
            logsWithoutIds.Add(entry);
            continue;
        }

        touchedSets.Clear();
        HashSet<string> current = null;

        // Pass 1: register unseen ids in one fresh set, remember the sets of already-known ids.
        foreach (var key in entry.Ids)
        {
            HashSet<string> known;
            if (setOfId.TryGetValue(key, out known))
            {
                touchedSets.Add(known);
                continue;
            }

            current = current ?? new HashSet<string>();
            current.Add(key);
            setOfId.Add(key, current);
        }

        // Pass 2: fold every touched set into one, always moving the smaller set into the larger.
        foreach (var touched in touchedSets)
        {
            if (current == null)
            {
                current = touched;
                continue;
            }

            HashSet<string> absorbed;
            if (touched.Count > current.Count)
            {
                absorbed = current;
                current = touched;
            }
            else
            {
                absorbed = touched;
            }

            // Re-point every moved id at the surviving set so the dictionary stays consistent.
            foreach (var key in absorbed)
            {
                current.Add(key);
                setOfId[key] = current;
            }
        }
    }

    // Any id of a log identifies its group, because all ids of a group map to the same set.
    var result = logs
        .Where(entry => entry.Ids.Count != 0)
        .GroupBy(entry => setOfId[entry.Ids.First()])
        .Select(members => members.ToList())
        .ToList();

    if (logsWithoutIds.Count != 0)
    {
        result.Add(logsWithoutIds);
    }

    return result;
}

由于构建字典和GroupBy操作都具有线性时间复杂度，因此该算法的时间复杂度也是线性的。在包含25,000条日志的列表上运行它需要花费毫秒。

编辑：以上内容足够好，但可以进一步优化。合并集合时，潜在的昂贵操作正在更新字典：

// Moves every id of the smaller set into the surviving set.
foreach (var id in fromSet)
{
    idSet.Add(id);
    idSetById[id] = idSet; // <-- one dictionary write per moved id
}

可以通过将临时结构内部的HashSet<string>替换为一个持有HashSet<string>、并且该引用可以从外部重新赋值的类来避免这一开销，代价是占用一些额外的内存：

/// <summary>
/// Mutable holder for a set of ids. The <see cref="Ids"/> reference can be replaced from outside,
/// so several holders can be made to point at the same <see cref="HashSet{T}"/> instance.
/// </summary>
private class IdSet
{
    /// <summary>The ids currently held; starts as an empty set.</summary>
    public HashSet<string> Ids;

    public IdSet()
    {
        Ids = new HashSet<string>();
    }
}

所以我们可以改用这样的东西：

idSet.Ids.UnionWith(fromSet.Ids); // copy every id of fromSet's set into idSet's set
fromSet.Ids = idSet.Ids; // re-point fromSet so both wrappers share one HashSet instance
// NOTE(review): only fromSet is re-pointed here; any other IdSet that still references
// fromSet's previous HashSet keeps pointing at that old set.

这还使我们能够单次执行合并操作，而无需使用mergeSets变量。

这里是更新的方法（请注意，与第一个实现相反，我们需要使用IdSet.Ids值作为分组键而不是IdSet对象）：

/// <summary>
/// Groups logs that are connected, directly or through other logs, by shared ids.
/// </summary>
/// <param name="logs">Logs to group; each log's <c>Ids</c> is read without a null check.</param>
/// <returns>
/// One list per group of connected logs, followed by a single list with all logs that have
/// no ids (added only when such logs exist).
/// </returns>
private static ICollection<List<Log>> GroupIds(List<Log> logs)
{
    var emptyIdsGroup = new List<Log>();
    var idSetById = new Dictionary<string, IdSet>();
    // For every live HashSet (keyed by reference), all IdSet wrappers currently pointing at it.
    // Needed because several wrappers can share one set after a merge: re-pointing only the two
    // wrappers involved in a later merge would leave the others on a stale set and split the
    // group (e.g. logs {a}, {b}, {a,b}, {c,d,e}, {a,c}).
    var holdersBySet = new Dictionary<HashSet<string>, List<IdSet>>();
    foreach (var log in logs)
    {
        if (log.Ids.Count == 0)
        {
            emptyIdsGroup.Add(log);
            continue;
        }
        IdSet idSet = null;
        foreach (var id in log.Ids)
        {
            IdSet mergeSet;
            if (!idSetById.TryGetValue(id, out mergeSet))
            {
                // Unseen id: attach it to this log's current set, creating one if necessary.
                if (idSet == null)
                {
                    idSet = new IdSet();
                    holdersBySet.Add(idSet.Ids, new List<IdSet> { idSet });
                }
                idSet.Ids.Add(id);
                idSetById.Add(id, idSet);
            }
            else if (idSet == null)
            {
                idSet = mergeSet;
            }
            else if (idSet.Ids != mergeSet.Ids)
            {
                // Merge the set with less elements into the set with more elements
                var target = idSet.Ids;
                var source = mergeSet.Ids;
                if (target.Count < source.Count)
                {
                    var swap = target;
                    target = source;
                    source = swap;
                }
                target.UnionWith(source);

                // Re-point EVERY wrapper of the absorbed set, not just the one found via this id.
                // A set never has more wrappers than ids, so this costs no more than the union.
                var targetHolders = holdersBySet[target];
                foreach (var holder in holdersBySet[source])
                {
                    holder.Ids = target;
                    targetHolders.Add(holder);
                }
                holdersBySet.Remove(source);
                // idSet was a holder of either set, so idSet.Ids == target from here on.
            }
        }
    }

    // The shared HashSet instance (not the wrapper) identifies a group.
    var groups = logs
        .Where(log => log.Ids.Count > 0)
        .GroupBy(log => idSetById[log.Ids.First()].Ids, (key, group) => group.ToList())
        .ToList();

    if (emptyIdsGroup.Count > 0) groups.Add(emptyIdsGroup);

    return groups;
}

使用linq分组相关列表

1 个答案: