我目前正在使用以下正则表达式和代码来解析 HTML 文档中的电子邮件地址：
// Pattern for e-mail-like tokens: a local part made of word characters
// optionally joined by '-', '+' or '.', then '@', then a domain that must
// contain at least one '.'.
string pattern = @"\w+([-+.]\w+)*@\w+([-.]\w+)*\.\w+([-.]\w+)*";
// RegexOptions.None contributes no flags, so only Compiled is in effect here.
Regex regex = new Regex(
pattern,
RegexOptions.None | RegexOptions.Compiled);
// `input` is declared outside this snippet; presumably it holds the whole
// HTML document as one string (verify against the caller).
MatchCollection matches = regex.Matches(input); // Here is where it takes time
// NOTE(review): MatchCollection is documented as lazily populated, so the
// matching cost may actually be paid when Count is read below rather than in
// the Matches() call above; confirm with a profiler.
MessageBox.Show(matches.Count.ToString());
// Visit each match; the loop body is elided in the original snippet.
foreach (Match match in matches)
{
...
}
例如:
尝试解析 http://www.amelia.se/Pages/Amelia-search-result-page/?q= 的内容时，
在 RegexHero 上会直接崩溃。
有没有办法优化这个?
答案 0 :(得分:1)
为了进一步说明 @Joey 的建议：我建议逐行读取输入，跳过所有不包含 @ 的行，
只对剩下的行应用你的正则表达式。这应该会大大减轻负担。
/// <summary>
/// Scans a text file line by line and collects every e-mail-like match.
/// </summary>
/// <remarks>
/// The pattern requires a literal '@', so any line without one cannot produce
/// a match and is skipped before the regular expression is run on it.
/// </remarks>
/// <param name="path">
/// File to scan. Defaults to the location that used to be hard-coded, so
/// existing parameterless calls keep reading the same file.
/// </param>
/// <returns>All matches in file order; an empty list when nothing matches.</returns>
private List<Match> find_emails_matches(string path = @"C:\tmp\test.txt")
{
    const string pattern = @"\w+([-+.]\w+)*@\w+([-.]\w+)*\.\w+([-.]\w+)*";
    Regex regex = new Regex(pattern, RegexOptions.Compiled);

    List<Match> result = new List<Match>();

    // File.ReadLines streams the file lazily and disposes its reader when the
    // enumeration ends, so no explicit using blocks are needed.
    foreach (string line in File.ReadLines(path))
    {
        // IndexOf(char) exists on every framework version; Contains(char)
        // needs either a newer runtime or a LINQ import.
        if (line.IndexOf('@') < 0)
        {
            continue;
        }

        // Matches() is evaluated lazily: the matching work happens while
        // this loop enumerates the collection, not in the Matches() call.
        foreach (Match m in regex.Matches(line))
        {
            result.Add(m);
        }
    }

    return result;
}