Question

我有很多包含ID的列表。要筛选掉重复的列表，以及属于另一个列表子集的列表，最佳方法是什么？我的问题是，随着“列表的列表”大小加倍，我的算法耗时几乎成倍增加。

我已经尝试了ContainsCombinatie的多种变体，包括：

使用哈希集
使用SortedList

Check whether an array is a subset of another

下面是带有计时器的单元测试，供您试用。

    /// <summary>
    /// Timing harness for <c>CollectionFilter.FilterDoubles</c>: builds random sorted
    /// lists of ids, runs the filter once and prints the counts and the elapsed time.
    /// </summary>
    public class PerformanceTestThis
    {
        /// <summary>
        /// Generates 10000 random lists drawn from the ids 0..1999, filters them and
        /// writes the number of lists before/after plus the elapsed milliseconds to
        /// the console. Nothing is asserted; this is a benchmark only.
        /// </summary>
        [Test]
        public void PerformanceTest2()
        {
            List<int> allValues = Enumerable.Range(0, 2000).ToList();

            var combinaties = new List<List<int>>();
            while (combinaties.Count < 10000)
            {
                combinaties.Add(GenerateCombinatie(allValues));
            }

            Console.WriteLine($"Generated {combinaties.Count} combinaties");

            // Only the filter call itself is timed.
            var timer = Stopwatch.StartNew();
            var result = new CollectionFilter().FilterDoubles(combinaties);
            timer.Stop();
            Console.WriteLine($"Filtered down to {result.Count} combinaties");

            Console.WriteLine(timer.ElapsedMilliseconds);
        }

        /// <summary>
        /// Picks random values from <paramref name="allVerstrekkingen"/> without
        /// replacement and returns them in ascending order. The argument itself is
        /// not modified; the picks are removed from a private copy.
        /// </summary>
        /// <param name="allVerstrekkingen">Pool of values to draw from.</param>
        /// <returns>A new list with the drawn values, sorted ascending.</returns>
        private List<int> GenerateCombinatie(List<int> allVerstrekkingen)
        {
            var remaining = new List<int>(allVerstrekkingen);
            var picked = new List<int>();

            // NOTE(review): the upper bound is drawn again before every iteration, so
            // the loop ends as soon as one draw is <= the number of values picked so
            // far. This keeps the lists short and is deliberately left as-is because
            // it determines the benchmark workload.
            // Presumably GetRandomNumber(n) returns a value in [0, n) -- confirm against Generator.
            var drawn = 0;
            while (drawn < Generator.GetRandomNumber(1000))
            {
                var index = Generator.GetRandomNumber(remaining.Count);
                var value = remaining[index];
                picked.Add(value);
                remaining.Remove(value);
                drawn++;
            }

            picked.Sort();
            return picked;
        }
    }

    /// <summary>
    /// Removes lists that are duplicates of, or subsets of, another list.
    /// </summary>
    public class CollectionFilter
    {
        /// <summary>
        /// Returns the lists from <paramref name="combinaties"/> that are not contained
        /// in a list that was already kept. Lists are visited from longest to shortest
        /// (stable for equal lengths), so the result is in that order and, of several
        /// equal lists, the first one in the input is the one that is kept.
        /// </summary>
        /// <param name="combinaties">The lists to filter; not modified.</param>
        /// <returns>
        /// A new outer list holding the kept inner list instances. An empty list is
        /// only kept when nothing else was kept before it (it is a subset of anything).
        /// </returns>
        public List<List<int>> FilterDoubles(List<List<int>> combinaties)
        {
            var withoutDoubles = new List<List<int>>();

            // One hash set per kept list, in the same order as withoutDoubles, so each
            // membership test is O(1) instead of a linear List.Contains scan.
            var keptSets = new List<HashSet<int>>();

            foreach (var current in combinaties.OrderByDescending(x => x.Count))
            {
                if (!keptSets.Any(kept => ContainsCombinatie(kept, current)))
                {
                    withoutDoubles.Add(current);
                    keptSets.Add(new HashSet<int>(current));
                }
            }

            return withoutDoubles;
        }

        /// <summary>
        /// True when every value of <paramref name="candidate"/> is present in
        /// <paramref name="container"/> (vacuously true for an empty candidate).
        /// </summary>
        private static bool ContainsCombinatie(HashSet<int> container, List<int> candidate)
        {
            return candidate.All(container.Contains);
        }
    }

Answer 1

我提出以下方法：

创建“冲突”表

此表为每个值收集所有包含该值的列表。完成后，某些值只对应一个列表，另一些值则对应很多列表。

与先前的条目相交

对于每个列表，取出该列表中各个值在上述表中对应的条目，并计算这些条目的交集。如果交集中只有一个元素（即该列表本身），那么它就不是重复项（double）。

   /// <summary>
   /// Removes duplicate lists and lists that are subsets of another list, using a
   /// per-value "collision" table instead of pairwise comparison of all lists.
   /// </summary>
   public class CollectionFilter2
    {
        /// <summary>
        /// Returns, in input order, every non-empty list whose set of values is not
        /// strictly contained in another list's set of values. When several lists
        /// have exactly the same set of values, only the first one is returned.
        /// Empty lists are never returned.
        /// </summary>
        /// <param name="combinaties">The lists to filter; not modified.</param>
        /// <returns>A new outer list holding the kept inner list instances.</returns>
        public List<List<int>> FilterDoubles( List<List<int>> combinaties )
        {
            var candidates = combinaties.Where( c => c.Count > 0 ).ToList();

            // Distinct values per candidate. Comparing the sizes of these sets is what
            // tells an exact duplicate (same size) apart from a strict superset (larger).
            var valueSets = candidates.Select( c => new HashSet<int>( c ) ).ToList();

            // First part: for each value, the indices of all candidates containing it.
            // Indices are appended in ascending order and each candidate at most once
            // per value, because the distinct values are iterated.
            var hitDictionary = new Dictionary<int, List<int>>();
            for ( var index = 0 ; index < candidates.Count ; index++ )
            {
                foreach ( var value in valueSets[index] )
                {
                    if ( hitDictionary.TryGetValue( value, out var hits ) == false )
                    {
                        hits = new List<int>();
                        hitDictionary[value] = hits;
                    }

                    hits.Add( index );
                }
            }

            var result = new List<List<int>>();

            // Second part: intersect the hit lists of all values of a candidate. What
            // remains are the candidates containing every one of its values, i.e. the
            // candidate itself, its exact duplicates and its strict supersets.
            for ( var index = 0 ; index < candidates.Count ; index++ )
            {
                List<int> inter = null;
                foreach ( var value in valueSets[index] )
                {
                    var hits = hitDictionary[value];
                    inter = inter == null ? hits : inter.Intersect( hits ).ToList();

                    // Only the candidate itself is left: nothing else can contain it.
                    if ( inter.Count == 1 )
                    {
                        break;
                    }
                }

                // Intersect keeps the order of its first sequence, so inter is still
                // ascending and inter[0] is the earliest candidate with these values.
                // Keep the candidate only if it is that earliest one and nothing in
                // the intersection has more distinct values (a strict superset).
                var ownSize = valueSets[index].Count;
                if ( inter[0] == index && inter.All( other => valueSets[other].Count == ownSize ) )
                {
                    result.Add( candidates[index] );
                }
            }

            return result;
        }
    }

供参考：在我的PC上，原来的算法大约需要8秒，而这个算法大约需要0.7秒（使用与问题中相同的数据量）。

编辑：

考虑到LINQ中“Intersect”的实现方式，以下是基于相同原理的优化版本：

/// <summary>
/// Removes duplicate lists and lists that are subsets of another list. Same
/// collision-table principle as the plain dictionary/Intersect approach, but the
/// intersections are done on hash sets and start from the rarest value.
/// </summary>
public class CollectionFilter4
{
    // Working record for one non-empty input list.
    class Temp
    {
        public int Index; // Position among the non-empty input lists
        public List<int> Combinaty; // Original list
        public List<int> Values; // Distinct values
    }

    /// <summary>
    /// Returns, in input order, every non-empty list whose set of values is not
    /// strictly contained in another list's set of values. When several lists have
    /// exactly the same set of values, only the first one is returned. Empty lists
    /// are never returned.
    /// </summary>
    /// <param name="combinaties">The lists to filter; not modified.</param>
    /// <returns>A new outer list holding the kept inner list instances.</returns>
    public List<List<int>> FilterDoubles( List<List<int>> combinaties )
    {
        // Generate distinct values
        var temps = combinaties
            .Where( c => c.Count > 0 )
            .Select( ( c, index ) => new Temp() { Index = index, Combinaty = c, Values = c.Distinct().ToList() } )
            .ToList();

        // Collision dictionary: for each value, every Temp whose list contains it
        var hitDictionary = new Dictionary<int, List<Temp>>();
        foreach ( var temp in temps )
        {
            foreach ( var value in temp.Values )
            {
                if ( hitDictionary.TryGetValue( value, out var list ) == false )
                {
                    list = new List<Temp>();
                    hitDictionary[value] = list;
                }

                list.Add( temp );
            }
        }

        // Ascending sort on collision count, so the intersection starts from the
        // value shared by the fewest lists and shrinks as early as possible
        temps.ForEach( t => t.Values.Sort( ( a, b ) => hitDictionary[a].Count.CompareTo( hitDictionary[b].Count ) ) );

        var result = new List<Temp>();

        foreach ( var temp in temps )
        {
            var values = temp.Values;
            var count = values.Count;

            // Start from the lists sharing the first value, then narrow down
            var inter = new HashSet<Temp>( hitDictionary[values[0]] );

            for ( var i = 1 ; i < count && inter.Count > 1 ; i++ )
            {
                // Rewritten intersection
                inter = Intersect( hitDictionary[values[i]], inter );
            }

            // inter now holds temp itself plus every list containing all of its
            // values (exact duplicates and strict supersets), or temp alone
            if ( IsRepresentative( temp, inter ) )
            {
                result.Add( temp );
            }
        }

        return result.Select( r => r.Combinaty ).ToList();
    }

    // True when no list in containing is a strict superset of candidate and no
    // earlier list has exactly the same values. Every member of containing holds all
    // of candidate's values, so an equal distinct count means an identical set.
    static bool IsRepresentative( Temp candidate, HashSet<Temp> containing )
    {
        foreach ( var other in containing )
        {
            if ( other.Values.Count != candidate.Values.Count )
            {
                return false; // strict superset exists
            }

            if ( other.Index < candidate.Index )
            {
                return false; // an earlier duplicate is the one that is kept
            }
        }

        return true;
    }

    // Same as original linq code except but optimized for this case: the second
    // operand is already a hash set. Note that it is consumed (elements are removed
    // from it), so callers must not reuse it afterwards.
    static HashSet<TSource> Intersect<TSource>( IEnumerable<TSource> first, HashSet<TSource> second )
    {
        var result = new HashSet<TSource>();

        foreach ( TSource element in first )
        {
            if ( second.Remove( element ) )
            {
                result.Add( element );
            }
        }

        return result;
    }
}

这是linq（更通用）的实现，以供参考：

// Quoted LINQ reference implementation of Intersect (shown for comparison only).
// Loads every element of `second` into a set, then streams `first` and yields an
// element only when it can be removed from that set.
// NOTE(review): Set<TSource> is not defined in this snippet; presumably Remove
// returns true only when the element was present, which would mean each matching
// value is yielded at most once, in the order of `first` -- confirm against the
// framework source.
static IEnumerable<TSource> IntersectIterator<TSource>(IEnumerable<TSource> first, IEnumerable<TSource> second, IEqualityComparer<TSource> comparer)
        {
            Set<TSource> set = new Set<TSource>(comparer);
            // The whole of `second` is consumed before the first element is produced.
            foreach (TSource element in second) set.Add(element);
            foreach (TSource element in first)
                if (set.Remove(element)) yield return element;
}

过滤掉重复列表和包含列表的最佳/高效方式

1 个答案: