Question

我的任务是比较数据库中两个表的数据以计算相似度。例如，如果每个表有5条记录，那么我需要将表A中的每条记录与表B中的所有记录逐一比较，得到它们的相似度。之前我使用单线程时，每个表有500条记录，耗时约4分钟；现在我改用4个线程，却要花半个小时！我的思路是：把第一个表分成4份，每份存储一部分数据，然后用线程池中的4个线程分别开始比较。代码如下，pr1、pr2是两个表的数据：

// Deduplication instance built from the second table (pr2) and the similarity threshold.
Deduplication d = new Deduplication(pr2, threshold);

// Delegate wrapper so that d.Find can be started through BeginInvoke.
Func<List<ParentRecord>, List<ParentRecord>> method = d.Find;

// Start four asynchronous invocations, each receiving its own slice of `part` records from pr1.
for (int sliceIndex = 0; sliceIndex < 4; sliceIndex++)
{
    List<ParentRecord> slice = pr1.Skip(sliceIndex * part).Take(part).ToList();
    method.BeginInvoke(slice, CallBackMethod, method);
}

/// <summary>
/// Completion callback for one asynchronous Find invocation: collects that
/// invocation's results and, once more than three callbacks have completed,
/// asks the UI thread to display the merged list.
/// </summary>
/// <param name="result">
/// Async result whose AsyncState is the Func delegate that was started with BeginInvoke.
/// </param>
private void CallBackMethod(IAsyncResult result)
{
    var target = (Func<List<ParentRecord>, List<ParentRecord>>)result.AsyncState;
    List<ParentRecord> p = target.EndInvoke(result);

    bool allFinished;
    lock (_locker)
    {
        records.AddRange(p);

        // Callbacks may run concurrently, so the counter is updated and tested
        // under the same lock as the shared list. An unsynchronized
        // `countThread++` can lose increments (the grid would never be filled)
        // or let several callbacks observe the final value.
        countThread++;
        allFinished = countThread > 3;
    }

    if (allFinished)
    {
        // Marshal the grid update back to the UI thread.
        this.BeginInvoke(new PopulateDelegate(PopulateGridView), new object[] { records });
    }
}

/// <summary>
/// Shows the given records in the parent grid and clears the duplicates grid.
/// </summary>
/// <param name="p">Records assigned to dataGridViewParent.DataSource.</param>
private void PopulateGridView(List<ParentRecord> p)
{ 
    dataGridViewParent.DataSource = p;
    // Drop whatever the duplicates grid was previously bound to.
    dataGridViewDuplication.DataSource = null;
}

抱歉，我是多线程的新手，所以这个想法可能听起来有点愚蠢，如果你能说清楚，我将非常感谢，谢谢。

更新

/// <summary>
/// Compares every record of DataSource1 with every record of DataSource2 and, for
/// each DataSource1 record, gathers the DataSource2 records whose similarity is at
/// least the threshold.
/// </summary>
/// <returns>
/// One ParentRecord per DataSource1 record that has at least one match, carrying
/// its matches in duplicateRecordList. The list is empty when nothing matches.
/// </returns>
public List<ParentRecord> Find()
    {
        List<ParentRecord> result = new List<ParentRecord>();

        foreach (ParentRecord p1 in DataSource1)
        {
            List<DuplicateRecord> addedDuplicateRecords = new List<DuplicateRecord>();

            foreach (ParentRecord p2 in DataSource2)
            {
                // Only rows with different primary keys are compared.
                if (p1.PrimaryKey != p2.PrimaryKey)
                {
                    // Identical text (ignoring case) scores 1 without running the
                    // matcher. An ordinal ignore-case comparison avoids allocating
                    // two upper-cased copies per pair and does not depend on the
                    // current culture.
                    float similarity = string.Equals(p1.CompareRow, p2.CompareRow, System.StringComparison.OrdinalIgnoreCase)
                        ? 1F
                        : GetSimilarity(p1.CompareRow, p2.CompareRow);

                    if (similarity >= threshold)
                    {
                        DuplicateRecord duplicateRecord = new DuplicateRecord();
                        duplicateRecord.PrimaryKey = p2.PrimaryKey;
                        duplicateRecord.CompareToRow = p2.CompareRow;
                        duplicateRecord.Similarity = similarity;
                        addedDuplicateRecords.Add(duplicateRecord);
                    }
                }
            }

            // Emit a parent entry only when at least one record met the threshold.
            if (addedDuplicateRecords.Count > 0)
            {
                ParentRecord parentRecord = new ParentRecord();
                parentRecord.PrimaryKey = p1.PrimaryKey;
                parentRecord.CompareRow = p1.CompareRow;
                parentRecord.duplicateRecordList = addedDuplicateRecords;
                result.Add(parentRecord);
            }
        }

        return result;
    }

    /// <summary>
    /// Scores two values by passing their string forms to MatchsMaker.
    /// </summary>
    /// <param name="obj1">First value; its ToString() result is used.</param>
    /// <param name="obj2">Second value; its ToString() result is used.</param>
    /// <returns>The Score reported by MatchsMaker for the two strings.</returns>
    private float GetSimilarity(object obj1, object obj2)
    {
        string left = obj1.ToString();
        string right = obj2.ToString();
        return new MatchsMaker(left, right).Score;
    }

}

Answer 1

很难看出这段代码到底在做什么。我会尝试用一条精心设计的查询，或者先把所有数据一次性读入内存，来彻底解决这个问题，而不是从多个线程中逐条执行查询。

Answer 2

我猜你要么在业务层死锁，要么在数据库中获取表级锁。

我一时能想到的替代方案：

并排读取两个 DataReader
数据库游标（这可能是合法用途）
基于一系列 INNER JOIN 的查询
CTE（假设您使用的是SQL Server）

如果您的性能测试只涉及500条记录，我很难想象这些方法中的任何一种会需要花费几分钟的时间。

使用多线程后，程序反而变得慢得多

2 个答案: