我有近 100 万个项目,每个项目都有一组互不重复的 Int32,我称之为 idS。
对于每一对项目(100 万 × (100 万 − 1)),
我需要统计两者中匹配的 idS 的数量。
这是我目前的做法:
// For every unordered pair of items (each pair is visited once, because the
// inner loop only takes items whose ID is greater than the outer item's ID),
// count how many ids the two sets have in common.
foreach (Item item1 in Items)
{
    HashSet<Int32> idS1 = item1.idS;
    // Fixed: the original inner foreach header was missing its closing parenthesis.
    foreach (Item item2 in Items.Where(x => x.ID > item1.ID))
    {
        HashSet<Int32> idS2 = item2.idS;
        Int32 matchCount = 0;
        // Enumerate the set with fewer elements and probe the other one, so the
        // number of Contains lookups never exceeds the smaller set's size.
        if (idS1.Count < idS2.Count)
        {
            foreach (Int32 i1 in idS1)
            {
                if (idS2.Contains(i1)) matchCount++;
            }
        }
        else
        {
            foreach (Int32 i2 in idS2)
            {
                if (idS1.Contains(i2)) matchCount++;
            }
        }
    }
}
有没有更快的方法?
我试过 IntersectWith,但它更慢。
我不需要实际的交集,只需要交集的元素个数。
我不打算在生产负载下并行运行,因此希望把它限制在单个 CPU 上。
关于有序集合,还有一些补充说明:
我对交集很小的配对不感兴趣。
如果较大的计数 > 4 × 较小的计数,我就跳过这一对。
从 SQL 读入数据时,我按计数排序。
我会丢弃低计数的集合(我有 3 个滚动集合,其中集合 2 的计数是集合 1 的 2 倍,集合 3 的计数是集合 1 的 4 倍)。
我需要使用滚动集合来避免 OOM(内存不足)。
以下是全部代码:
/// <summary>
/// Opens the three SQL connections, streams (sID, wordID, textUniqueWordCount) rows ordered by
/// unique-word count, groups consecutive rows with the same sID into one
/// <c>DocUniqueWords</c>, and scores the documents bucket by bucket via <c>ScoreTwo</c>.
/// </summary>
/// <remarks>
/// A bucket is closed whenever a row's textUniqueWordCount exceeds the current upper bound,
/// which starts at 20 and doubles each time. Three lists (docsA/docsB/docsC) are rotated so
/// that only the current bucket and the two previous ones are held in memory.
/// SQL errors are reported through a message box; no exception is propagated for them.
/// </remarks>
public void Logon()
{
    sqlCon1 = new SqlConnection(connString);
    sqlCon2 = new SqlConnection(connString);
    sqlCon3 = new SqlConnection(connString);
    try
    {
        sqlCon1.Open();
        sqlCon2.Open();
        sqlCon3.Open();
    }
    catch (SqlException ex)
    {
        MessageBox.Show(ex.Message);
        // Without an open connection ExecuteReader would throw an uncaught
        // InvalidOperationException, so stop here after reporting the failure.
        return;
    }

    List<DocUniqueWords> docsA = new List<DocUniqueWords>();
    List<DocUniqueWords> docsB = new List<DocUniqueWords>();
    List<DocUniqueWords> docsC = new List<DocUniqueWords>();
    List<DocUniqueWords> docsActive = docsA;                      // bucket currently being filled
    List<DocUniqueWords> docs0 = new List<DocUniqueWords>();      // bucket two steps back (empty at start)
    List<DocUniqueWords> docs1 = new List<DocUniqueWords>();      // previous bucket (empty at start)
    DocUniqueWords doc = new DocUniqueWords(0);                   // SID 0 acts as the "no document yet" sentinel
    Int32 sID;
    Int32 textUniqueWordCount;
    Int32 maxTextUniqueWordCount = 20;
    Int32 ccount = 0;                                             // documents started in the current bucket
    byte textUniqueWordIter = 0;                                  // rotation state: 0, 1, 2, 0, ...
    try
    {
        using (SqlCommand sqlcmd1 = sqlCon1.CreateCommand())
        {
            sqlcmd1.CommandText = "SELECT [FTSindexWordOnce].[sID], [FTSindexWordOnce].[wordID], [docSVsys].[textUniqueWordCount]" + Environment.NewLine +
                " FROM [FTSindexWordOnce] with (nolock)" + Environment.NewLine +
                " JOIN [docSVsys] with (nolock)" + Environment.NewLine +
                " ON [docSVsys].[sID] = [FTSindexWordOnce].[sID] " + Environment.NewLine +
                " WHERE [docSVsys].[textUniqueWordCount] > 10" + Environment.NewLine +
                " ORDER BY [docSVsys].[textUniqueWordCount] asc, [FTSindexWordOnce].[sID], [FTSindexWordOnce].[wordID]";
            using (SqlDataReader rdr = sqlcmd1.ExecuteReader())
            {
                while (rdr.Read())
                {
                    textUniqueWordCount = rdr.GetInt32(2);
                    if (textUniqueWordCount > maxTextUniqueWordCount)
                    {
                        // The row belongs to a larger bucket: score what has been collected, then rotate.
                        // NOTE(review): the document still held in `doc` has not been added to
                        // docsActive yet, so it ends up in the NEXT bucket - confirm this is intended.
                        System.Diagnostics.Debug.WriteLine("");
                        System.Diagnostics.Debug.WriteLine(maxTextUniqueWordCount.ToString() + " " + ccount.ToString()
                            + " " + docsA.Count.ToString() + " " + docsB.Count.ToString() + " " + docsC.Count.ToString());
                        maxTextUniqueWordCount = maxTextUniqueWordCount * 2;
                        ccount = 0;
                        TimeScoreTwo("ScoreTwo(docs0, docsActive)", docs0, docsActive);
                        TimeScoreTwo("ScoreTwo(docs1, docsActive)", docs1, docsActive);
                        TimeScoreTwo("ScoreTwo(docsActive, docsActive)", docsActive, docsActive);
                        // Rotate: previous -> docs0, active -> docs1, and the list that was docs0
                        // is recycled as the new (cleared) active bucket.
                        switch (textUniqueWordIter)
                        {
                            case 0:
                                Console.WriteLine("Case 0");
                                textUniqueWordIter = 1;
                                docs0 = docsC;
                                docs1 = docsA;
                                docsActive = docsB;
                                break;
                            case 1:
                                Console.WriteLine("Case 1");
                                textUniqueWordIter = 2;
                                docs0 = docsA;
                                docs1 = docsB;
                                docsActive = docsC;
                                break;
                            case 2:
                                Console.WriteLine("Case 2");
                                textUniqueWordIter = 0;
                                // Fixed: the original assigned docsC to both docs1 and docsActive
                                // (and docsA to docs0), so the Clear() below wiped the bucket that
                                // had just become docs1 and docsB dropped out of the rotation.
                                docs0 = docsB;
                                docs1 = docsC;
                                docsActive = docsA;
                                break;
                            default:
                                Console.WriteLine("Default case");
                                break;
                        }
                        docsActive.Clear();
                    }
                    sID = rdr.GetInt32(0);
                    if (doc.SID != sID)
                    {
                        // A new document starts: store the finished one (at most the first
                        // documents of a bucket are kept, see the ccount cap) and begin the next.
                        if (doc.SID != 0 && doc.Words.Count > 0 && ccount < 100000) docsActive.Add(doc);
                        doc = new DocUniqueWords(sID);
                        ccount++;
                    }
                    doc.Words.Add(rdr.GetInt32(1));
                }
            }
        }
        if (doc.Words.Count > 0) docsActive.Add(doc);
        // NOTE(review): the last bucket is collected but never passed to ScoreTwo - confirm.
        System.Diagnostics.Debug.WriteLine("docs.Count = " + docsActive.Count.ToString("N0"));
        System.Diagnostics.Debug.WriteLine("done");
    }
    catch (SqlException ex)
    {
        MessageBox.Show(ex.Message);
    }
}

/// <summary>
/// Runs <c>ScoreTwo</c> on the given lists and writes the elapsed time
/// (milliseconds, minutes, hours) to the debug output, prefixed with <paramref name="label"/>.
/// </summary>
private void TimeScoreTwo(string label, List<DocUniqueWords> docsOuter, List<DocUniqueWords> docsInner)
{
    System.Diagnostics.Stopwatch sw = System.Diagnostics.Stopwatch.StartNew();
    ScoreTwo(docsOuter, docsInner);
    long elapsedMs = sw.ElapsedMilliseconds;
    System.Diagnostics.Debug.WriteLine(" " + label + " ElapsedMilliseconds = " + elapsedMs.ToString("N0")
        + " ElapsedMinutes = " + (elapsedMs / 60000m).ToString("N2")
        + " ElapsedHours = " + (elapsedMs / 3600000m).ToString("N2"));
}
/// <summary>
/// Counts, for pairs of documents taken from the two lists, how many word ids they share.
/// </summary>
/// <param name="docsOuter">Documents iterated in the outer loop.</param>
/// <param name="docsInner">
/// Documents iterated in the inner loop. When this is the same list instance as
/// <paramref name="docsOuter"/>, each unordered pair is visited once.
/// </param>
/// <remarks>
/// The per-pair count (<c>hashMatchCount</c>) is computed and then discarded; the method has
/// no return value and stores nothing.
/// </remarks>
public void ScoreTwo(List<DocUniqueWords> docsOuter, List<DocUniqueWords> docsInner)
{ // docsInner is >= wordcount
    // The same-list test does not change during the run, so decide once up front.
    if (docsOuter == docsInner)
    {
        // Index pairs (j < i) visit every unordered pair exactly once without re-filtering
        // the whole list for each outer document. The original selected the partner with
        // x.SID < d1.SID, which yields the same pairs provided SIDs are unique in the list.
        // NOTE(review): assumes one DocUniqueWords per SID - confirm against the caller.
        for (Int32 i = 1; i < docsOuter.Count; i++)
        {
            DocUniqueWords d1 = docsOuter[i];
            for (Int32 j = 0; j < i; j++)
            {
                DocUniqueWords d2 = docsOuter[j];
                Int32 hashMatchCount = 0;
                // Enumerate the smaller word set and probe the larger one.
                if (d1.Words.Count <= d2.Words.Count)
                {
                    foreach (Int32 d1sID in d1.Words)
                    {
                        if (d2.Words.Contains(d1sID)) hashMatchCount++;
                    }
                }
                else
                {
                    foreach (Int32 d2sID in d2.Words)
                    {
                        if (d1.Words.Contains(d2sID)) hashMatchCount++;
                    }
                }
            }
        }
        return;
    }

    foreach (DocUniqueWords d1 in docsOuter)
    {
        foreach (DocUniqueWords d2 in docsInner)
        {
            // Skip pairs where the inner document has more than 4x the outer document's words.
            if (d1.Words.Count * 4 < d2.Words.Count)
            {
                continue;
            }
            Int32 hashMatchCount = 0;
            foreach (Int32 d1sID in d1.Words)
            {
                if (d2.Words.Contains(d1sID)) hashMatchCount++;
            }
        }
    }
}
答案 0 :(得分:2)
你似乎认为求交集很慢,但事实并非如此。
即使有上百万个元素,求一次交集也可能只需要几毫秒。
但对你的外层循环,我就不能这么说了,它会很慢:
foreach (Item item1 in Items)
{
HashSet<Int32> idS1 = item1.idS;
foreach(Item item2 in Items.Where(x => x.ID > item1.ID)
...
如果我没看错,这听起来是 O(n^2) 的复杂度。这里的技巧是预先对项目列表排序,这样就不需要 Where
条件了。
答案 1 :(得分:1)
这不是你期望的答案:我没能写出比游标(cursor)更快的方案。
-- For each document (sID) in [FTSindexWordOnce], counts the wordIDs it shares with every
-- document that has a larger sID and whose textUniqueWordCount is within a factor of 4,
-- and inserts one (sID1, sID2, matchCount) row per pair into [ftsIndexWordMatch].
-- Pairs already present in [ftsIndexWordMatch] are skipped.
select 'starting'
set nocount on
--
-- Scratch table holding the wordIDs of the document currently being processed.
IF OBJECT_ID(N'tempdb..#UniqueWords', N'U') IS NOT NULL DROP TABLE #UniqueWords
CREATE TABLE #UniqueWords (wordID INT PRIMARY KEY CLUSTERED);
--
-- Alternative result target; only used if the commented-out insert below is enabled.
IF OBJECT_ID(N'tempdb..#docMatchScore', N'U') IS NOT NULL DROP TABLE #docMatchScore
CREATE TABLE #docMatchScore (sID1 INT, sID2 INT, matchCount INT, PRIMARY KEY CLUSTERED (sID1, sID2));
--
declare @sID int;
declare @wordCountBase int;
-- LOCAL FAST_FORWARD: the cursor is only read forward, once, and only by this batch.
DECLARE score_cursor CURSOR LOCAL FAST_FORWARD FOR
SELECT [sID], count(*)
from [FTSindexWordOnce] with (nolock)
--WHERE [sID] < 10000
GROUP BY [sID]
ORDER BY [sID];
OPEN score_cursor
Select 'FETCH NEXT FROM score_cursor'
FETCH NEXT FROM score_cursor
INTO @sID, @wordCountBase
Select 'starting cursor'
WHILE @@FETCH_STATUS = 0
BEGIN
    -- Load the current document's words.
    truncate table #UniqueWords;
    insert into #UniqueWords ([wordID])
    select [wordID]
    from [FTSindexWordOnce] with (nolock)
    where [sID] = @sID
    order by [wordID];
    --insert into #docMatchScore (sID1, sID2, matchCount)
    insert into [ftsIndexWordMatch] with (tablock) (sID1, sID2, matchCount)
    -- count() over the left-joined column counts only matching words, so pairs with
    -- no shared word still produce a row with matchCount = 0.
    select @sID, [FTSindexWordOnce].[sID], count(#UniqueWords.[wordID])
    from [FTSindexWordOnce] with (nolock)
    join [docSVsys] with (nolock)
    on [FTSindexWordOnce].[sID] > @sID
    and [docSVsys].[sID] = [FTSindexWordOnce].[sID]
    and [docSVsys].[textUniqueWordCount] * 4 >= @wordCountBase
    and [docSVsys].[textUniqueWordCount] <= @wordCountBase * 4
    left join #UniqueWords
    on #UniqueWords.[wordID] = [FTSindexWordOnce].[wordID]
    -- Anti-join: keep only pairs that are not yet stored in [ftsIndexWordMatch].
    left join [ftsIndexWordMatch]
    on [ftsIndexWordMatch].[sID1] = @sID
    and [ftsIndexWordMatch].[sID2] = [FTSindexWordOnce].[sID]
    where [ftsIndexWordMatch].[sID1] is null
    and [ftsIndexWordMatch].[sID2] is null
    --and [FTSindexWordOnce].[sID] < 1000
    group by [FTSindexWordOnce].[sID]
    FETCH NEXT FROM score_cursor
    INTO @sID, @wordCountBase
END
CLOSE score_cursor;
DEALLOCATE score_cursor;
--
-- NOTE(review): returns no rows while the insert into #docMatchScore above stays commented out.
select * from #docMatchScore
IF OBJECT_ID(N'tempdb..#UniqueWords', N'U') IS NOT NULL DROP TABLE #UniqueWords
IF OBJECT_ID(N'tempdb..#docMatchScore', N'U') IS NOT NULL DROP TABLE #docMatchScore
Select 'done'
答案 2 :(得分:1)