我需要将电子邮件的发送记录与退回记录进行匹配,以便判断它们是否已送达。难点在于,我必须把退回限制在发送后的4天之内,以避免把退回匹配到错误的发送记录上。发送记录的时间跨度为30天。
// Builds one "Delivered" record for every send that has no bounce for the same
// address inside the window (send date, send date + deliveredCalcDelayDays).
LinkedList<event_data> sent = GetMyHugeListOfSends(); //for example 1M+ records
List<event_data> bounced = GetMyListOfBounces(); //for example 150k records
// Oldest bounce first, so Find returns the earliest bounce that matches a send.
bounced = bounced.OrderBy(o => o.event_date).ToList();
List<event_data> delivered = new List<event_data>();
foreach (event_data sentEmail in sent)
{
    // Loop-invariant for the predicate below: compute the window end once per send
    // instead of once per candidate bounce.
    var windowEnd = sentEmail.event_date.AddDays(deliveredCalcDelayDays);
    // Ordinal, case-insensitive comparison avoids allocating two lowered strings per
    // candidate and is not affected by the current culture.
    event_data bounce = bounced.Find(item =>
        string.Equals(item.email, sentEmail.email, StringComparison.OrdinalIgnoreCase)
        && item.event_date > sentEmail.event_date
        && item.event_date < windowEnd);
    if (bounce != null)
    {
        //there was a bounce! don't add a delivered record!
        //remove bounce, it only applies to one send!
        bounced.Remove(bounce);
        continue;
    }
    //if sent is not bounced, it's delivered
    delivered.Add(new event_data
    {
        sid = siteid,
        mlid = mlid,
        mid = mid,
        email = sentEmail.email,
        event_date = sentEmail.event_date,
        event_status = "Delivered",
        event_type = "Delivered",
        id = sentEmail.id,
        number = sentEmail.number,
        laststoretransaction = sentEmail.laststoretransaction
    });
    // Deliberately no early exit when the bounce list runs empty: every remaining
    // send still needs its delivered record.
}
所以我做了一些测试,它每秒处理大约12个已发送的记录。在1M +记录中,处理需要25个小时!
两个问题:
谢谢!
修改
--- ---思想
答案 0 :(得分:4)
我有理由相信,耗时的正是你的 Find 调用。
看起来您确定 Find 方法只会返回 0 或 1 条记录(而不是列表)。在这种情况下,加快速度的方法是创建一个查找表(字典):不要为退回变量创建 List<event_data>,而是改为创建 Dictionary<key, event_data>,然后你就可以按键查找值,而不必调用 Find。
诀窍是创建你的密钥(我对你的应用程序不太了解以帮助解决这个问题),但基本上与你的查找中的标准相同。
EDIT。 (添加一些伪代码)
Dictionary<key, event_data>
答案 1 :(得分:1)
您应该使用ToLookup
方法为电子邮件地址创建查找表
// Group the bounces by email address. The ordinal ignore-case comparer makes the
// lookup case-insensitive without allocating a lowered copy of every address.
var bouncedLookup = bounced.ToLookup(k => k.email, StringComparer.OrdinalIgnoreCase);
并在循环中使用它来首先通过电子邮件查找
// Narrow the candidates to the bounces recorded for this send's address.
// The key is lowered the same way the lookup keys were built.
var filteredBounced = bouncedLookup[sentEmail.email.ToLower()];
// mini optimisation here: compute the end of the match window once, not once per candidate
var endDate = sentEmail.event_date.AddDays(deliveredCalcDelayDays);
// A lookup grouping is an IEnumerable<T>, which has no Find method; FirstOrDefault
// returns the default value (null for a reference type) when no bounce is in the window.
event_data bounce = filteredBounced.FirstOrDefault(item => item.event_date > sentEmail.event_date && item.event_date < endDate);
我无法编译它,但我认为应该这样做。请试一试。
答案 2 :(得分:0)
将退回转换为sortedlist可能是一个很好的解决方案
// Index the bounces by email address in a sorted list.
// NOTE(review): ToDictionary throws ArgumentException when two bounces share the
// same email, so this only works if every address bounced at most once — confirm.
SortedList<string, event_data> sl = new SortedList<string, event_data>(bounced.ToDictionary(s => s.email, s => s));
and to find a bounce use
// End of the window in which a bounce may still be attributed to this send.
var endDate = sentEmail.event_date.AddDays(deliveredCalcDelayDays);
// Filter with Where (Select would only project each entry to a bool), then project
// to the stored bounce; the result is the default value when nothing matches.
event_data bounce = sl
    .Where(c => c.Key.Equals(sentEmail.email, StringComparison.OrdinalIgnoreCase)
             && c.Value.event_date > sentEmail.event_date
             && c.Value.event_date < endDate)
    .Select(c => c.Value)
    .FirstOrDefault();
答案 3 :(得分:0)
您正在列表中查找项目。这意味着它必须遍历整个列表,因此每次查找都是 O(n) 操作。您能否把已发送的电子邮件存储在字典中,并以要搜索的电子邮件地址作为键?然后遍历退回记录,通过键链接回字典中的电子邮件。每次查找是常数时间,而您只需遍历一遍退回记录,因此整体是 O(n)。您当前的方法是 O(n²)。
答案 4 :(得分:0)
在考虑时,反弹次数相对较小,因此,
为什么不尽可能地预先选择退回查找,此代码会为每次可能的退回设置委托,并将它们分组到字典中以便通过电子邮件密钥进行访问。
/// <summary>
/// Determines whether a bounce can be attributed to a send: the bounce must occur
/// strictly after the send and strictly before the send date plus the delay window.
/// </summary>
/// <param name="sendDate">When the email was sent.</param>
/// <param name="bouncedDate">When the bounce was recorded.</param>
/// <param name="deliveredCalcDelayDays">Length of the match window, in days.</param>
/// <returns>
/// <c>true</c> when <paramref name="bouncedDate"/> lies inside the open interval
/// that starts at <paramref name="sendDate"/> and ends
/// <paramref name="deliveredCalcDelayDays"/> days later; otherwise <c>false</c>.
/// </returns>
private static bool DateInRange(
    DateTime sendDate,
    DateTime bouncedDate,
    int deliveredCalcDelayDays)
{
    // A bounce cannot precede (or coincide with) the send it belongs to.
    if (bouncedDate <= sendDate)
    {
        return false;
    }

    return bouncedDate < sendDate.AddDays(deliveredCalcDelayDays);
}
/// <summary>
/// Lazily yields a "Delivered" record for every send that cannot be paired with a
/// bounce for the same email address. Each bounce is consumed by at most one send.
/// </summary>
/// <param name="sent">The send records to classify, in the order they should claim bounces.</param>
/// <param name="bounced">The bounce records to match against.</param>
/// <param name="siteId">Value copied into <c>sid</c> of every delivered record.</param>
/// <param name="mlId">Value copied into <c>mlid</c> of every delivered record.</param>
/// <param name="mId">Value copied into <c>mid</c> of every delivered record.</param>
/// <param name="deliveredCalcDelayDays">Match window in days, forwarded to <c>DateInRange</c>.</param>
/// <returns>One new record per send for which no bounce check succeeded.</returns>
static IEnumerable<event_data> GetDeliveredMails(
    IEnumerable<event_data> sent,
    IEnumerable<event_data> bounced,
    int siteId,
    int mlId,
    int mId,
    int deliveredCalcDelayDays)
{
    // email -> one date check per bounce, earliest bounce first. The comparer makes
    // the grouping and the dictionary case-insensitive without lowering every address.
    var lookup = bounced
        .GroupBy(b => b.email, StringComparer.OrdinalIgnoreCase)
        .ToDictionary(
            g => g.Key,
            g => g.OrderBy(e => e.event_date)
                  .Select(e => new Func<DateTime, bool>(
                      sentDate => DateInRange(sentDate, e.event_date, deliveredCalcDelayDays)))
                  .ToList(),
            StringComparer.OrdinalIgnoreCase);

    foreach (var s in sent)
    {
        List<Func<DateTime, bool>> checks;
        if (lookup.TryGetValue(s.email, out checks))
        {
            var match = checks.FirstOrDefault(c => c(s.event_date));
            if (match != null)
            {
                // This bounce is now used up; it must not absorb a later send.
                checks.Remove(match);
                continue;
            }
        }

        yield return new event_data
        {
            sid = siteId,
            mlid = mlId,
            mid = mId,
            email = s.email,
            event_date = s.event_date,
            event_status = "Delivered",
            event_type = "Delivered",
            id = s.id,
            number = s.number,
            laststoretransaction = s.laststoretransaction
        };
    }
}
如果速度不够快,您可以尝试在查找中预编译代理。
答案 5 :(得分:0)
还有一个关于你的代码的问题,我想指出。
内存消耗。我不知道你的机器配置,但这里有一些关于代码的想法:
- 最初,您要为 120 万以上个 event_data 类型的对象分配空间。我看不到 event_data 的完整类型定义,但假设电子邮件都是唯一的,并且看到该类型有很多属性,我可以推断这样的集合相当重(可能数百兆)。
- 接着,您又分配了另一批 event_data 对象(如果我没算错的话,将近 100 万个),在内存消耗方面就更重了。
- 在这样的内存压力下,每次调用 bounced.Remove(bounce); 之后都可能触发垃圾回收,这确实会显著拖慢您的应用。
所以,即使你有足够的内存和/或你的应用程序是64位,我也会尽量减少内存消耗。很确定它会让你的代码运行得更快。例如,您可以对 deliveredEmail 进行完整处理而无需将其存储,或者分块加载初始的 event_data 等。
答案 6 :(得分:0)
好的,我找到的最终解决方案是弹跳字典。
发送的LinkedList按sent_date排序,因此它将按时间顺序循环。这很重要,因为我必须将正确的发送与正确的反弹相匹配。
我创建了 Dictionary<string, List<event_data>>,其中键是电子邮件地址,值是该地址的所有 event_data 退回记录的列表。列表按 event_date 排序,因为我想确保最早的退回与发送匹配。
最终结果......它从处理700记录/分钟变为500k +记录/秒。
以下是最终代码:
// Final approach: walk the sends in chronological order and look each address up in
// a dictionary of bounces instead of scanning one flat bounce list per send.
const int bounceWindowDays = 4; // a bounce counts only if it arrives within this many days after the send

LinkedList<event_data> sent = GetMyHugeListOfSends();
// Chronological order, so the earliest send claims the earliest bounce.
IEnumerable<event_data> sentOrdered = sent.OrderBy(send => send.event_date);
// email -> bounces for that address.
// NOTE(review): each list is expected to be ordered by event_date, and the keys are
// expected to match sentEmail.email exactly (or use a case-insensitive comparer) —
// confirm in GetMyListOfBouncesAsDictionary.
Dictionary<string, List<event_data>> bounced = GetMyListOfBouncesAsDictionary();
List<event_data> delivered = new List<event_data>();

foreach (event_data sentEmail in sentOrdered)
{
    bool matchedBounce = false;
    List<event_data> bounces;

    if (bounced.TryGetValue(sentEmail.email, out bounces))
    {
        //there was a bounce! find out if it was within 4 days after the send!
        var windowEnd = sentEmail.event_date.AddDays(bounceWindowDays);
        int matchIndex = bounces.FindIndex(bounce =>
            bounce.event_date > sentEmail.event_date &&
            bounce.event_date <= windowEnd);

        if (matchIndex >= 0)
        {
            matchedBounce = true;
            //remove the record because a bounce can only match once back to a send
            bounces.RemoveAt(matchIndex);
            if (bounces.Count == 0) //no more bounces for this email
            {
                bounced.Remove(sentEmail.email);
            }
        }
    }

    if (!matchedBounce)
    {
        //if sent is not bounced, it's delivered
        delivered.Add(new event_data
        {
            sid = siteid,
            mlid = mlid,
            mid = mid,
            email = sentEmail.email,
            event_date = sentEmail.event_date,
            event_status = "Delivered",
            event_type = "Delivered",
            id = sentEmail.id,
            number = sentEmail.number,
            laststoretransaction = sentEmail.laststoretransaction
        });
    }
    // Deliberately no early exit when the bounce dictionary runs empty: every
    // remaining send still needs its delivered record.
}