对于与最可能的电子邮件地址匹配的算法,我在运行时方面遇到了一些麻烦。函数本身运行良好(因为它正确配对了名称和电子邮件地址),但运行时非常庞大,很难在大型数据集上实现。我是编码的初学者,很想听听你们提供的解决方案。
快速记录我在这里实施了Levenshtein的算法。如果有更高效的算法,请在下面发表评论!
from string import digits
import copy
import re
# levenshtein algorithm found on https://www.python-course.eu/levenshtein_distance.php
def call_counter(func):
def helper(*args, **kwargs):
helper.calls += 1
return func(*args, **kwargs)
helper.calls = 0
helper.__name__= func.__name__
return helper
def memoize(func):
mem = {}
def memoizer(*args, **kwargs):
key = str(args) + str(kwargs)
if key not in mem:
mem[key] = func(*args, **kwargs)
return mem[key]
return memoizer
@call_counter
@memoize
def levenshtein(s, t):
if s == "":
return len(t)
if t == "":
return len(s)
if s[-1] == t[-1]:
cost = 0
else:
cost = 1
res = min([levenshtein(s[:-1], t)+1,
levenshtein(s, t[:-1])+1,
levenshtein(s[:-1], t[:-1]) + cost])
return res
def emailmatch(emails_file,name_file):
name_email_match = {} #store the matching emails in a dictionary
with open(name_file, 'r') as names:
match_name = 0
for individual in names:
with open(emails_file,'r') as address_emails:
first_name = individual[:(individual.index(" "))].lower()
last_name = individual[(individual.rindex(" ")):].lower()
full_name = (first_name + last_name).lower()
full_name_period = (first_name+"."+last_name).lower()
best_match = "" #this holds the best matching email
minimum = 999
for emails in address_emails:
email = emails[0:(emails.index('@'))]
temp = min(levenshtein(last_name,email),
levenshtein(first_name,email),
levenshtein(full_name,email),
levenshtein(full_name_period,email))
if (temp < minimum):
minimum = temp
best_match = emails
name_email_match[individual] = best_match
return name_email_match
emailmatch('emails.txt', 'names.txt')