我有一份CSV格式的公司名称及其ID的列表,其中有很多完全重复或疑似重复(即部分匹配)的记录。例如:Google India Pvt Ltd 与 Google Ind Pvt Ltd。
我已经用Python编写了一段模糊匹配的代码,运行良好,但我无法在匹配时排除列表中最常见的单词。另外,如何在Python中实现类似Excel中VLOOKUP的功能?
import pandas as pd
# Raise the column display limit so wide frames are printed without elision.
pd.options.display.max_columns = 100000

# Read the raw company list, decoding the file as cp1252.
df = pd.read_csv("Sample DDB Data.csv", encoding="cp1252")

# Notebook-style inspection of the first rows and the column labels; these
# bare expressions only display something in an interactive session.
df.head()
df.columns
from collections import Counter
# Unique company names exactly as they appear in the 'COMP_NAME' column.
all_names = df['COMP_NAME'].unique()

# Count how often each word occurs across the unique names.
names_freq = Counter()
for name in all_names:
    # split() without an argument splits on any run of whitespace and never
    # yields empty strings. split(" ") produced '' for repeated, leading or
    # trailing spaces, and an empty-string keyword is a substring of every
    # name. str() keeps non-string values (e.g. NaN) from raising here.
    names_freq.update(str(name).split())

# The 30 most frequent words are treated as generic company keywords.
key_words = [word for word, _ in names_freq.most_common(30)]
print(key_words)
len(all_names)
# Working table with one row per unique company name. 'COMP_ID' and
# 'Duplicate_COMP_ID' are created here but are not filled in by this block.
all_main_name = pd.DataFrame(columns=['sort_gp','COMP_ID','names','Duplicate','Duplicate_COMP_ID','score'])

# Sort the names so that names sharing a first character end up in one
# contiguous run of rows. Missing values are dropped and everything is
# converted to str first: a blank CSV cell is read as NaN (a float), and
# sorting a mix of float and str raises TypeError.
all_names = pd.Series(all_names).dropna().astype(str).sort_values().to_numpy()
all_main_name['names'] = all_names

# Bucket key = first character of the name. Slicing ([:1]) instead of
# indexing ([0]) yields '' for an empty name rather than raising IndexError.
all_main_name['sort_gp'] = all_main_name['names'].apply(lambda x: x[:1])
from fuzzywuzzy import fuzz
all_sort_gp = all_main_name['sort_gp'].unique()
def no_key_word(name, keywords=None):
    """Return True when *name* contains none of the common company keywords.

    Args:
        name: Company name to inspect. Non-string values are converted
            with ``str()`` before being examined.
        keywords: Optional iterable of words to look for. When omitted,
            the module-level ``key_words`` list is used.

    Returns:
        ``True`` if no keyword occurs as a whole whitespace-separated word
        of ``name``; ``False`` otherwise. The comparison is case-sensitive.
    """
    if keywords is None:
        keywords = key_words
    # Compare whole words rather than substrings: a substring test lets
    # "of" match "Microsoft", and lets an empty-string keyword match
    # every name.
    words = set(str(name).split())
    return not any(key in words for key in keywords)
# Cluster near-duplicate names inside each first-character group, then save.
#
# The columns are pulled into plain lists, updated there, and written back
# once at the end. The chained form ``frame[col].iloc[i] = value`` assigns
# through a temporary Series; pandas warns about it, and with Copy-on-Write
# enabled the assignment never reaches the frame.
names = all_main_name['names'].tolist()
duplicate_of = all_main_name['Duplicate'].tolist()
scores = all_main_name['score'].tolist()

# First and last row position of every sort group, collected in one pass
# instead of rebuilding a groupby for each group.
group_bounds = {}
for pos, gp in enumerate(all_main_name['sort_gp'].tolist()):
    first, _ = group_bounds.get(gp, (pos, pos))
    group_bounds[gp] = (first, pos)

total_names = len(all_names)
# Report progress roughly every 10% of the names. max(1, ...) keeps the
# modulo below from dividing by zero when there are fewer than 10 names.
progress_step = max(1, total_names // 10)

for sortgp in all_sort_gp:
    gp_start, gp_end = group_bounds[sortgp]
    for i in range(gp_start, gp_end + 1):
        # A name not yet claimed by an earlier row is its own canonical name.
        if pd.isna(duplicate_of[i]):
            duplicate_of[i] = names[i]
            scores[i] = 100
        # Every later, still unclaimed name in the group that matches row i
        # closely enough inherits row i's canonical name.
        for j in range(i + 1, gp_end + 1):
            if not pd.isna(duplicate_of[j]):
                continue
            fuzz_score = fuzz.token_sort_ratio(names[i], names[j])
            # A candidate containing a common keyword is penalised by 10
            # points, so it needs a closer match to pass the threshold.
            if not no_key_word(names[j]):
                fuzz_score -= 10
            if fuzz_score > 85:
                duplicate_of[j] = duplicate_of[i]
                scores[j] = fuzz_score
        if i % progress_step == 0:
            print("progress: %.2f" % (100 * i / total_names) + "%")

# Write the results back with plain column assignment, then export.
all_main_name['Duplicate'] = duplicate_of
all_main_name['score'] = scores
all_main_name.to_csv('Fuzzy Sample DDB.csv')