我有很多包含ID的列表。筛选掉重复列表和另一个列表的子集的最佳方法是什么?我的问题是,随着列表的列表大小加倍,我的算法在时间上几乎成倍增加。
我已经尝试了ContainsCombinatie的多种变体,包括:
下面是带有计时器的单元测试,供您试用。
/// <summary>
/// Timing harness for <c>CollectionFilter.FilterDoubles</c>: builds random input,
/// runs the filter once and prints the result size and the elapsed milliseconds.
/// </summary>
public class PerformanceTestThis
{
    /// <summary>
    /// Generates 10000 random combinations drawn from the ids 0..1999, filters them and
    /// writes the counts and the elapsed time to the console. Makes no assertions.
    /// </summary>
    [Test]
    public void PerformanceTest2()
    {
        var allValues = Enumerable.Range(0, 2000).ToList();

        var combinaties = new List<List<int>>();
        for (int i = 0; i < 10000; i++)
        {
            combinaties.Add(GenerateCombinatie(allValues));
        }
        Console.WriteLine($"Generated {combinaties.Count} combinaties");

        // Only the filtering itself is timed, not the input generation.
        var stopwatch = Stopwatch.StartNew();
        var result = new CollectionFilter().FilterDoubles(combinaties);
        stopwatch.Stop();

        Console.WriteLine($"Filtered down to {result.Count} combinaties");
        Console.WriteLine(stopwatch.ElapsedMilliseconds);
    }

    /// <summary>
    /// Picks a random number of values from <paramref name="allVerstrekkingen"/> without
    /// replacement and returns them sorted ascending. The input list is not modified.
    /// </summary>
    /// <param name="allVerstrekkingen">Pool of values to draw from.</param>
    /// <returns>A new, ascending-sorted list of the drawn values.</returns>
    private List<int> GenerateCombinatie(List<int> allVerstrekkingen)
    {
        var combinatie = new List<int>();

        // Work on a copy so that drawn values can be removed from the pool.
        var verstrekkingen = allVerstrekkingen.ToList();

        // Draw the target size ONCE. Evaluating the random call in the loop condition
        // re-rolls the bound on every iteration, which skews the lengths toward short lists.
        // The size is clamped so the pool can never be exhausted.
        // NOTE(review): assumes Generator.GetRandomNumber(max) returns a value in [0, max) - confirm.
        var size = Math.Min(Generator.GetRandomNumber(1000), verstrekkingen.Count);

        for (int i = 0; i < size; i++)
        {
            var index = Generator.GetRandomNumber(verstrekkingen.Count);
            combinatie.Add(verstrekkingen[index]);

            // Remove by position: no need to search the pool for the value again.
            verstrekkingen.RemoveAt(index);
        }

        return combinatie.OrderBy(x => x).ToList();
    }
}
/// <summary>
/// Removes every list that is a duplicate of, or a subset of, another list.
/// </summary>
public class CollectionFilter
{
    /// <summary>
    /// Returns the lists from <paramref name="combinaties"/> whose values are not all
    /// contained in a list that was kept before them.
    /// </summary>
    /// <remarks>
    /// Lists are processed from longest to shortest (stable for equal lengths), so the
    /// result is ordered by descending <c>Count</c>. Of several lists with the same
    /// content only the first one processed is kept. An empty list is only kept when
    /// nothing has been kept before it.
    /// </remarks>
    /// <param name="combinaties">The lists to filter; the argument itself is not modified.</param>
    /// <returns>A new list holding the surviving inner lists (same instances as the input).</returns>
    /// <exception cref="ArgumentNullException"><paramref name="combinaties"/> is null.</exception>
    public List<List<int>> FilterDoubles(List<List<int>> combinaties)
    {
        if (combinaties == null)
        {
            throw new ArgumentNullException(nameof(combinaties));
        }

        var withoutDoubles = new List<List<int>>();

        // Hash-set view of every kept list, parallel to withoutDoubles. Membership tests
        // against a kept list are then O(1) instead of a linear List.Contains scan.
        var keptSets = new List<HashSet<int>>();

        foreach (var current in combinaties.OrderByDescending(x => x.Count))
        {
            if (keptSets.Any(kept => ContainsCombinatie(kept, current)))
            {
                continue;
            }

            withoutDoubles.Add(current);
            keptSets.Add(new HashSet<int>(current));
        }

        return withoutDoubles;
    }

    /// <summary>
    /// True when every value of <paramref name="list2"/> is present in <paramref name="set1"/>
    /// (trivially true for an empty <paramref name="list2"/>).
    /// </summary>
    private static bool ContainsCombinatie(HashSet<int> set1, List<int> list2)
    {
        return list2.All(set1.Contains);
    }
}
答案 0 :(得分:1)
我提出以下方法:
首先构建一张表(字典),为每个值收集所有包含该值的列表。构建完成后,某些值只对应一个列表,另一些则对应很多个。
然后,对于每个列表,针对其中的每个值,计算上述表中对应条目的交集。如果交集只包含一个元素(即该列表本身),那么该列表既不是重复项,也不是其他列表的子集。
/// <summary>
/// Removes duplicate lists and lists that are subsets of another list, using a
/// value -> "lists containing that value" index instead of pairwise comparison.
/// </summary>
public class CollectionFilter2
{
    /// <summary>
    /// Returns the non-empty lists of <paramref name="combinaties"/> that are neither a
    /// strict subset of another list nor a repeat of an earlier list with the same values.
    /// </summary>
    /// <remarks>
    /// The result keeps the input order. Empty lists are never returned. Of several lists
    /// holding the same set of values, only the first one in the input is kept.
    /// </remarks>
    /// <param name="combinaties">The lists to filter; the argument itself is not modified.</param>
    /// <returns>A new list holding the surviving inner lists (same instances as the input).</returns>
    /// <exception cref="ArgumentNullException"><paramref name="combinaties"/> is null.</exception>
    public List<List<int>> FilterDoubles( List<List<int>> combinaties )
    {
        if ( combinaties == null )
        {
            throw new ArgumentNullException( nameof( combinaties ) );
        }

        // Non-empty lists in input order; a candidate's id is its position in this list.
        var candidates = combinaties.Where( c => c.Count > 0 ).ToList();

        // Distinct values per candidate, so that set sizes can be compared reliably.
        var valueSets = candidates.Select( c => new HashSet<int>( c ) ).ToList();

        // First part: for each value, collect the ids of all candidates containing it.
        var hitDictionary = new Dictionary<int, List<int>>();
        for ( var id = 0 ; id < candidates.Count ; id++ )
        {
            foreach ( var value in valueSets[id] )
            {
                if ( hitDictionary.TryGetValue( value, out var hits ) == false )
                {
                    hits = new List<int>();
                    hitDictionary[value] = hits;
                }
                hits.Add( id );
            }
        }

        var result = new List<List<int>>();

        // Second part: intersect the hit lists of all values of a candidate. What remains
        // are exactly the candidates containing every one of its values (itself included).
        for ( var id = 0 ; id < candidates.Count ; id++ )
        {
            var self = id;
            var values = valueSets[self];

            List<int> inter = null;
            foreach ( var value in values )
            {
                var hits = hitDictionary[value];
                inter = inter == null ? hits : inter.Intersect( hits ).ToList();

                // Only the candidate itself is left: nothing else can contain it.
                if ( inter.Count == 1 )
                {
                    break;
                }
            }

            // Every other id left in the intersection is a superset. A larger set is a
            // strict superset; an equally large one has the same values, in which case
            // only the earliest candidate survives (instead of all copies being dropped).
            var covered = inter.Any( other => other != self
                && ( valueSets[other].Count > values.Count || other < self ) );

            if ( !covered )
            {
                result.Add( candidates[self] );
            }
        }

        return result;
    }
}
有关信息,在我的PC上,以前的算法大约是8秒,而这个算法大约是0.7秒(问题中给出的计数相同)。
编辑:
考虑到 LINQ 中 Intersect 的实现方式,以下是基于相同原理的优化版本:
/// <summary>
/// Removes duplicate lists and lists that are subsets of another list. Same principle as
/// the value -> lists index approach, with a hand-written hash-set intersection.
/// </summary>
public class CollectionFilter4
{
    // Per-list working data.
    class Temp
    {
        public List<int> Combinaty; // Original list
        public List<int> Values; // Distinct values
        public int Index; // Position among the non-empty input lists
    }

    /// <summary>
    /// Returns the non-empty lists of <paramref name="combinaties"/> that are neither a
    /// strict subset of another list nor a repeat of an earlier list with the same values.
    /// </summary>
    /// <remarks>
    /// The result keeps the input order. Empty lists are never returned. Of several lists
    /// holding the same set of values, only the first one in the input is kept.
    /// </remarks>
    /// <param name="combinaties">The lists to filter; neither it nor its inner lists are modified.</param>
    /// <returns>A new list holding the surviving inner lists (same instances as the input).</returns>
    /// <exception cref="ArgumentNullException"><paramref name="combinaties"/> is null.</exception>
    public List<List<int>> FilterDoubles( List<List<int>> combinaties )
    {
        if ( combinaties == null )
        {
            throw new ArgumentNullException( nameof( combinaties ) );
        }

        // Generate distinct values and remember the input order of each list
        var temps = combinaties
            .Where( c => c.Count > 0 )
            .Select( ( c, i ) => new Temp { Combinaty = c, Values = c.Distinct().ToList(), Index = i } )
            .ToList();

        // Collision dictionary: for each value, every Temp whose list contains it
        var hitDictionary = new Dictionary<int, List<Temp>>();
        foreach ( var temp in temps )
        {
            foreach ( var value in temp.Values )
            {
                if ( hitDictionary.TryGetValue( value, out var list ) == false )
                {
                    list = new List<Temp>();
                    hitDictionary[value] = list;
                }
                list.Add( temp );
            }
        }

        // Ascending sort on collision count, so the intersection starts from the value
        // shared with the fewest lists and shrinks as early as possible
        temps.ForEach( t => t.Values.Sort( ( a, b ) => hitDictionary[a].Count.CompareTo( hitDictionary[b].Count ) ) );

        var result = new List<Temp>();
        foreach ( var temp in temps )
        {
            var values = temp.Values;
            var count = values.Count;

            // Start from the lists sharing the first value, then narrow value by value
            // (or stop once only the list itself is left)
            var inter = new HashSet<Temp>( hitDictionary[values[0]] );
            for ( var i = 1 ; i < count && inter.Count > 1 ; i++ )
            {
                inter = Intersect( hitDictionary[values[i]], inter );
            }

            // Whatever else is left contains every value of this list. A larger value set
            // is a strict superset; an equally large one has the same values, in which
            // case only the earliest list survives (instead of all copies being dropped).
            var covered = inter.Any( other => !ReferenceEquals( other, temp )
                && ( other.Values.Count > count || other.Index < temp.Index ) );

            if ( !covered )
            {
                result.Add( temp );
            }
        }

        return result.Select( r => r.Combinaty ).ToList();
    }

    // Returns the elements of "first" that are also in "second". Matching elements are
    // removed from "second" as a side effect, so the caller must not reuse it afterwards.
    static HashSet<TSource> Intersect<TSource>( IEnumerable<TSource> first, HashSet<TSource> second )
    {
        var result = new HashSet<TSource>();
        foreach ( TSource element in first )
        {
            if ( second.Remove( element ) )
            {
                result.Add( element );
            }
        }
        return result;
    }
}
这是linq(更通用)的实现,以供参考:
// General-purpose intersection: lazily yields elements of "first" that could be removed
// from a set filled with the elements of "second".
// NOTE(review): Set<TSource> is not defined in this snippet; presumably Remove returns
// true only while the element is still in the set, so each match is yielded once - confirm.
static IEnumerable<TSource> IntersectIterator<TSource>(IEnumerable<TSource> first, IEnumerable<TSource> second, IEqualityComparer<TSource> comparer)
{
    // Load the whole second sequence into the comparer-based set.
    var remaining = new Set<TSource>(comparer);
    foreach (TSource candidate in second)
    {
        remaining.Add(candidate);
    }

    // Stream the first sequence, yielding only elements whose removal succeeds.
    foreach (TSource item in first)
    {
        if (!remaining.Remove(item))
        {
            continue;
        }

        yield return item;
    }
}