我使用 map reduce 创建了一个索引来查找重复项,并且已经成功找到了重复项。这是我的类:
/// <summary>
/// Plain data holder for an employee, exposing three read/write string properties.
/// </summary>
class Employee
{
    /// <summary>Gets or sets the employee's name.</summary>
    public string Name { get; set; }

    /// <summary>Gets or sets the employee's address.</summary>
    public string address { get; set; }

    /// <summary>Gets or sets the employee's e-mail address.</summary>
    public string email { get; set; }
}
使用map reduce,我的结果是这样的
Sal abc salo@yahoo.co 2
此处的数据表明有 2 个重复的文档。现在我的问题是,我必须删除其中一个文档。显然,通过这种方式我无法获取文档 ID,因为我不能按文档 ID 进行分组。那么在找到重复项之后,我该如何删除其中一个文档?
以下是 map reduce 的代码:
namespace DuplicateEmployee.Storage.Indexes
{
/// <summary>
/// Map/reduce index that counts how many employee documents share the same
/// last name, first name, address lines, city, state, zip and date of birth.
/// </summary>
/// <remarks>
/// Each mapped document contributes a <c>Count</c> of 1; the reduce step sums
/// those counts per distinct combination of the eight grouping fields.
/// NOTE(review): the reduce output carries only the grouping fields and the
/// count, not the ids of the documents that fell into each group.
/// </remarks>
public class DupEmployeeIndex : AbstractIndexCreationTask<Employee, DupEmployeeIndex.UniqueEmployeeResult>
{
    /// <summary>
    /// Defines the map and reduce functions of the index.
    /// </summary>
    public DupEmployeeIndex()
    {
        // Map: project every employee onto the fields that identify a duplicate,
        // seeding the counter with 1 so the reduce step can add them up.
        Map = docs =>
            from doc in docs
            select new
            {
                LastName = doc.LastName,
                FirstName = doc.FirstName,
                address1 = doc.Address1,
                address2 = doc.Address2,
                city = doc.City,
                state = doc.State,
                zip = doc.Zip,
                DOB = doc.DateOfBirth,
                Count = 1
            };

        // Reduce: group the mapped entries by all identifying fields and total
        // the per-entry counters of each group.
        Reduce = entries =>
            from entry in entries
            group entry by new
            {
                entry.LastName,
                entry.FirstName,
                entry.address1,
                entry.address2,
                entry.city,
                entry.state,
                entry.zip,
                entry.DOB
            }
            into sameIdentity
            select new
            {
                LastName = sameIdentity.Key.LastName,
                FirstName = sameIdentity.Key.FirstName,
                address1 = sameIdentity.Key.address1,
                address2 = sameIdentity.Key.address2,
                city = sameIdentity.Key.city,
                state = sameIdentity.Key.state,
                zip = sameIdentity.Key.zip,
                DOB = sameIdentity.Key.DOB,
                Count = sameIdentity.Sum(member => member.Count)
            };
    }

    /// <summary>
    /// Shape of one reduce result: the grouping fields plus the number of
    /// employee documents that share them.
    /// </summary>
    public class UniqueEmployeeResult
    {
        /// <summary>Last name shared by the grouped employees.</summary>
        public string LastName { get; set; }

        /// <summary>First name shared by the grouped employees.</summary>
        public string FirstName { get; set; }

        /// <summary>First address line shared by the grouped employees.</summary>
        public string address1 { get; set; }

        /// <summary>Second address line shared by the grouped employees.</summary>
        public string address2 { get; set; }

        /// <summary>City shared by the grouped employees.</summary>
        public string city { get; set; }

        /// <summary>State shared by the grouped employees.</summary>
        public string state { get; set; }

        /// <summary>Zip code shared by the grouped employees.</summary>
        public string zip { get; set; }

        /// <summary>Date of birth shared by the grouped employees, held as a string.</summary>
        public string DOB { get; set; }

        /// <summary>Number of employee documents in the group.</summary>
        public int Count { get; set; }
    }
}
}