使用LINQ GroupBy链产生具有聚合的多级层次结构

时间:2019-05-19 05:58:05

标签: c# linq

我碰巧这样做了,想到要问有关O(n),订单稳定性和枚举器实例的幕后保证。

我了解每次汇总所触及的枚举次数(例如计数和持续时间)会根据子级到父级的实际分布而有所不同,但至少每个汇总对每条记录仅枚举一次,难道不是这样吗?

在此示例中,我们有3个聚合和3个层次结构层次进行聚合,因此为O(9n)〜O(n)。

LINQ GroupBy 相关问题:

  1. GroupBy是否被证明具有线性复杂性和稳定的顺序,或者仅仅是Microsoft .NET的隐式实现方式吗?我没有任何特别的怀疑,它可能是非线性的,只是出于好奇而问。
  2. 实例化的枚举数似乎应该与层次结构中看到的父节点数始终成线性关系,是吗?
  3. 在相同的聚合统计和级别期间,没有枚举叶子或非叶子的次数超过一次,是吗?因为在某些单个“结果”中,每个节点只是一个父级的子级。
    using System;
    using System.Collections.Generic;
    using System.Linq;
    using Microsoft.VisualStudio.TestTools.UnitTesting;

    namespace NestedGroupBy
    {
        [TestClass]
        public class UnitTest1
        {
            /// <summary>
            /// Immutable leaf record. Category -> Subcategory -> Event forms the
            /// composite 3-tier grouping key; Duration is the value aggregated
            /// (summed) at every level of the hierarchy.
            /// </summary>
            struct Record
            {
                // cat->subcat->event is the composite 3-tier key
                private readonly string category;
                private readonly string subcategory;
                private readonly string ev;
                private readonly double duration;

                public Record(string category, string subcategory, string ev, double duration)
                {
                    this.category = category;
                    this.subcategory = subcategory;
                    this.ev = ev;
                    this.duration = duration;
                }

                public string Category { get { return category; } }
                public string Subcategory { get { return subcategory; } }
                public string Event { get { return ev; } }
                public double Duration { get { return duration; } }
            }

            // Sample input, deliberately NOT pre-sorted so that the OrderBy in
            // TestMethod1 is what establishes the hierarchical ordering.
            static readonly IList<Record> Rec1 = new System.Collections.ObjectModel.ReadOnlyCollection<Record>
            (new[] {
                 new Record ("Network", "SecurityCheck", "ReadAcl", 0.0145),
                 new Record ("Network", "Security", "ReadSocket", 0.0145),
                 new Record ("Network", "SecurityCheck", "ReadAcl", 0.0145),
                 new Record ("File", "ReadMetadata", "ReadDirectory", 0.0145),
                 new Record ("File", "ReadMetadata", "ReadDirectory", 0.0145),
                 new Record ("File", "ReadMetadata", "ReadDirectory", 0.0145),
                 new Record ("File", "ReadMetadata", "ReadSize", 0.0145),
                 new Record ("File", "ReadMetadata", "ReadSize", 0.0145),
                 new Record ("File", "ReadMetadata", "ReadSize", 0.0145),
                 new Record ("Registry", "ReadKey", "ReadKeyAcl", 0.0145),
                 new Record ("Registry", "ReadKey", "ReadKeyAcl", 0.0145),
                 new Record ("Registry", "ReadKey", "ReadKeyAcl", 0.0145),
                 new Record ("Registry", "ReadKey", "ReadKeyAcl", 0.0145),
                 new Record ("Registry", "ReadKey", "CacheKeyAcl", 0.0145),
                 new Record ("Registry", "ReadKey", "CacheKeyAcl", 0.0145),
                 new Record ("Registry", "ReadKey", "CheckKeyAcl", 0.0145),
                 new Record ("Registry", "ReadKey", "CheckKeyAcl", 0.0145),
                 new Record ("Registry", "ReadKey", "CheckKeyAcl", 0.0145),
                 new Record ("Registry", "WriteKey", "CheckPermissions", 0.0145),
                 new Record ("Registry", "WriteKey", "CheckOwner", 0.0145),
                 new Record ("Registry", "WriteKey", "InheritPermissions", 0.0145),
                 new Record ("Registry", "WriteKey", "ValidateKey", 0.0145),
                 new Record ("Registry", "WriteKey", "RecacheKey", 0.0145),
                 new Record ("File", "WriteData", "FlushData", 0.0145),
                 new Record ("File", "WriteData", "WriteBuffer", 0.0145),
                 new Record ("File", "WriteData", "WritePermissions", 0.0145),
                 new Record ("File", "ReadData", "CheckDataBuffer", 0.0145),
                 new Record ("File", "ReadData", "ReadBuffer", 0.0145),
                 new Record ("File", "ReadData", "ReadPermissions", 0.0145),
                 new Record ("Network", "SecurityCheck", "ReadAcl", 0.0145),
                 new Record ("Network", "SecurityCheck", "ReadAcl", 0.0145),
                 new Record ("Network", "SecurityCheck", "ReadAcl", 0.0145),
                 new Record ("Network", "SecurityCheck", "ReadAcl", 0.0145),
                 new Record ("Network", "SecurityCheck", "ReadAcl", 0.0145),
                 new Record ("Network", "Security", "ReadSocket", 0.0145),
                 new Record ("Network", "SecurityCheck", "ReadAcl", 0.0145),
                 new Record ("Network", "SecurityCheck", "ReadAcl", 0.0145),
                 new Record ("Network", "Security", "ReadSocket", 0.0145),
                 new Record ("Network", "SecurityCheck", "ReadAcl", 0.0145),
            });

            /// <summary>
            /// Builds a 3-level aggregation hierarchy (Category / Subcategory /
            /// Event) with chained GroupBy calls, then prints it depth-first.
            /// Each summary level keeps a reference to its children so the
            /// print loop can descend without re-querying.
            /// </summary>
            [TestMethod]
            public void TestMethod1()
            {
                // Perform one big sort to order all child rungs properly
                var s = Rec1.OrderBy(
                    r => r,
                    Comparer<Record>.Create(
                        (x, y) =>
                        {
                            int c = x.Category.CompareTo(y.Category);
                            if (c != 0) return c;
                            c = x.Subcategory.CompareTo(y.Subcategory);
                            if (c != 0) return c;
                            return x.Event.CompareTo(y.Event);
                        }));

                // This query enumerates bottom-up (in the key hierarchy-sense),
                // then proceeds to each higher summary (parent) level and retains
                // the "result" collection of its children determined by the preceding GroupBy.
                //
                // This is so each level can later step down into its own children for looping.
                // And the leaf durations, immediate child counts and leaf event counts are already calculated as well.
                //
                // Each aggregation (Count/Sum) scans a record/group once per level,
                // so the whole query is O(levels * aggregates * n) ~ O(n).
                var q = s
                    .GroupBy(
                        r => new { Category = r.Category, Subcategory = r.Subcategory, Event = r.Event },
                        (key, result) => {
                            int c = result.Count();
                            return new
                            {
                                LowKey = key,
                                // at this lowest summary level only,
                                // the hierarchical (immediate child) count is the same as the event (leaf) count
                                LowChildCount = c,
                                LowEventCount = c,
                                LowDuration = result.Sum(x => x.Duration),
                                LowChildren = result
                            };
                        })
                        .GroupBy(
                            r => new { Category = r.LowKey.Category, Subcategory = r.LowKey.Subcategory },
                                (key, result) => new {
                                    MidKey = key,
                                    MidChildCount = result.Count(),
                                    MidEventCount = result.Sum(x => x.LowEventCount),
                                    MidDuration = result.Sum(x => x.LowDuration),
                                    MidChildren = result
                                })
                                .GroupBy(
                                    r => new { Category = r.MidKey.Category },
                                    (key, result) => new {
                                        HighKey = key,
                                        HighChildCount = result.Count(),
                                        HighEventCount = result.Sum(x => x.MidEventCount),
                                        HighDuration = result.Sum(x => x.MidDuration),
                                        HighChildren = result
                                    });

                // Print the hierarchy depth-first. BUG FIX: the mid and low rows
                // previously printed high.HighEventCount in their event# column
                // (copy-paste error); each level now reports its own event count.
                foreach (var high in q)
                {
                    Console.WriteLine($"{high.HighKey.Category} child#={high.HighChildCount} event#={high.HighEventCount} duration={high.HighDuration}");
                    foreach (var mid in high.HighChildren)
                    {
                        Console.WriteLine($"  {mid.MidKey.Subcategory} child#={mid.MidChildCount} event#={mid.MidEventCount} duration={mid.MidDuration}");
                        foreach (var low in mid.MidChildren)
                        {
                            Console.WriteLine($"    {low.LowKey.Event} child#={low.LowChildCount} event#={low.LowEventCount} duration={low.LowDuration}");
                            foreach (var leaf in low.LowChildren)
                            {
                                Console.WriteLine($"      >> {leaf.Category}/{leaf.Subcategory}/{leaf.Event} duration={leaf.Duration}");
                            }
                        }
                    }
                }
            }
        }
    }

1 个答案:

答案 0 :(得分:1)

让我们首先从MS源代码中回顾Enumerable.GroupBy的实现:

分组依据

// Reference-source excerpt: the Enumerable.GroupBy overload taking key,
// element and result selectors. Note it does no work here — it only wraps
// the arguments in a lazy GroupedEnumerable (deferred execution).
public static IEnumerable<TResult> GroupBy<TSource, TKey, TElement, TResult>
(this IEnumerable<TSource> source, 
 Func<TSource, TKey> keySelector, 
 Func<TSource, TElement> elementSelector, 
 Func<TKey, IEnumerable<TElement>, TResult> resultSelector)
{
  // comparer is passed as null here; this overload has no comparer parameter
  return new GroupedEnumerable<TSource, TKey, TElement, TResult>(source, 
                                                                 keySelector, 
                                                                 elementSelector, 
                                                                 resultSelector, null);
}

GroupedEnumerable

// Reference-source excerpt: the lazy wrapper returned by the GroupBy overload
// above. Construction only validates and stores the delegates; the actual
// grouping is deferred until GetEnumerator() is called.
internal class GroupedEnumerable<TSource, TKey, TElement, TResult> : IEnumerable<TResult>{
    IEnumerable<TSource> source;
    Func<TSource, TKey> keySelector;
    Func<TSource, TElement> elementSelector;
    IEqualityComparer<TKey> comparer;
    Func<TKey, IEnumerable<TElement>, TResult> resultSelector;

    public GroupedEnumerable(IEnumerable<TSource> source, Func<TSource, TKey> keySelector, Func<TSource, TElement> elementSelector, Func<TKey, IEnumerable<TElement>, TResult> resultSelector, IEqualityComparer<TKey> comparer){
        // Eager argument validation; comparer may legitimately be null
        // (the no-comparer GroupBy overload passes null).
        if (source == null) throw Error.ArgumentNull("source");
        if (keySelector == null) throw Error.ArgumentNull("keySelector");
        if (elementSelector == null) throw Error.ArgumentNull("elementSelector");
        if (resultSelector == null) throw Error.ArgumentNull("resultSelector");
        this.source = source;
        this.keySelector = keySelector;
        this.elementSelector = elementSelector;
        this.comparer = comparer;
        this.resultSelector = resultSelector;
    }

    public IEnumerator<TResult> GetEnumerator(){
        // All grouping work happens here, on each enumeration: Lookup.Create
        // builds a hash-based Lookup from the whole source (a single pass per
        // the reference implementation — see Lookup.Create), then the result
        // selector is applied per group.
        Lookup<TKey, TElement> lookup = Lookup<TKey, TElement>.Create<TSource>(source, keySelector, elementSelector, comparer);
        return lookup.ApplyResultSelector(resultSelector).GetEnumerator();
    }

    IEnumerator IEnumerable.GetEnumerator(){
        return GetEnumerator();
    }
}

重要的一点是它在内部使用 Lookup 数据结构,该结构按键提供 O(1) 查找。因此它在内部枚举了所有记录,并为每条记录将数据添加到 Lookup 中,其结果类型为 IEnumerable&lt;IGrouping&lt;TKey,TElement&gt;&gt;。

现在您的特定问题

  

是否已将GroupBy记录为线性复杂度和稳定顺序,还是Microsoft .NET的隐式实现方式?我没有任何特别的怀疑,它可能是非线性的,只是出于好奇而问。

设计和代码表明它在顶层是 O(N),但可以肯定的是,这还取决于你在结果选择器中执行的操作;如果在其中进一步执行诸如排序之类的非线性操作,则会自动使整体复杂度超过 O(N)。

  

实例化的枚举器数量似乎应始终与层次结构中看到的父节点数成线性关系,对吗?

是的,它只是顶层的单个枚举器

  

在相同的聚合统计和级别期间,没有枚举任何叶子或非叶子超过一次,是吗?因为在某些单个“结果”中,每个节点只是一个父级的子级。

对于非叶节点,我假设您是指嵌套数据,因此这取决于您的模型设计;但如果每个元素都如预期那样拥有其嵌套数据的单独副本,而不是共享/引用,那么我看不出为什么同一个元素会被多次触及——它只是对特定子级数据的一次枚举。

更多详细信息

  • 在您的情况下,您正在编写级联的 GroupBy 查询,因此一个查询的结果将作为另一个查询的输入。就复杂性而言,它更像是 O(N) + O(N) + O(N);虽然在其中每一个内部至少还要进行一次 O(N) 操作(如 Count、Sum),但由于您的嵌套设计,总体复杂度仍应为 O(N) 而不是 O(N^2)。
  • 如果您有深层嵌套的数据,则可以考虑先将数据扁平化,而不是使用级联的 GroupBy,因为在扁平化后的数据上进行最终聚合的操作更为简单。