以下是我正在尝试的代码,但我没有得到预期的结果。
import re
def multiwordReplace(text, wordDic):
    """
    Replace every occurrence of a key of ``wordDic`` in ``text`` with the
    associated value and return the changed text.

    Keys are matched literally (regex metacharacters are escaped). When one
    key is a prefix of another (e.g. 'ANGLO' and 'ANGLO IRISH'), the longest
    key wins, so the replacement text is never followed by the leftover
    tail of the longer key.

    text    -- the string to search
    wordDic -- dict mapping the text to find to its replacement
    """
    # An empty dict would compile to an empty pattern, which matches at every
    # position and then fails the lookup of '' in translate().
    if not wordDic:
        return text

    # Regex alternation takes the first branch that matches, scanning left to
    # right, so longer keys must be listed before their own prefixes.
    longest_first = sorted(wordDic, key=len, reverse=True)
    rc = re.compile('|'.join(map(re.escape, longest_first)))

    def translate(match):
        # The matched text is always exactly one of the dictionary keys.
        return wordDic[match.group(0)]

    return rc.sub(translate, text)
# Replacement table: each key is the text to look for, each value is the
# text that is substituted for it.
wordDic = dict([
    ('ANGLO', 'ANGLO IRISH BANK'),
    ('ANGLO IRISH', 'ANGLO IRISH BANK'),
])
def replace(match):
    """Return the ``wordDic`` value for the text matched by ``match``.

    Intended as the replacement callback for ``re.sub``; the whole match
    (group 0) is used as the dictionary key.
    """
    matched_text = match.group(0)
    return wordDic[matched_text]
# Sample inputs. A list with an explicit comma between the two entries:
# without the comma the adjacent string literals are concatenated into one
# string, and the loop below would see a single item instead of two.
str1 = [
    'ANGLO IRISH CORP PLC - THIS FOLLOWS THE BANK NATIONALIZATION BY THE GOVT OF THE REPUBLIC OF IRELAND',
    'ANGLO CORP PLC - THIS FOLLOWS THE BANKS NATIONALIZATION BY THE GOVT OF THE REPUBLIC OF IRELAND',
]

# Word-boundary variant of the pattern, built once outside the loop. Keys are
# ordered longest first because alternation takes the first branch that
# matches, and 'ANGLO' would otherwise shadow 'ANGLO IRISH'.
_bounded_pattern = re.compile(
    '|'.join(r'\b%s\b' % re.escape(s)
             for s in sorted(wordDic, key=len, reverse=True)))

for item in str1:
    str2 = multiwordReplace(item, wordDic)
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print(str2)
    print(_bounded_pattern.sub(replace, item))
输出:
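ANGLO IRISH BANK IRISH CORP PLC - THIS FOLLOWS THE BANK NATIONALIZATION BY THE GOVT OF THE REPUBLIC OF IRELAND
ANGLO IRISH BANK CORP PLC - THIS FOLLOWS THE BANKS NATIONALIZATION BY THE GOVT OF THE REPUBLIC OF IRELAND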
第一行的结果应该以 'ANGLO IRISH BANK' 开头,而不是 'ANGLO IRISH BANK IRISH'。
答案 0 :(得分:1)
先把键按长度从长到短排序,让最长的键优先匹配。正则表达式的 `|` 会从左到右依次尝试各个分支,并采用第一个能匹配的分支;如果较短的 'ANGLO' 排在前面,'ANGLO IRISH' 就永远不会被匹配到。
# Order the keys from longest to shortest so that, in the alternation built
# below, a longer key is always tried before any key that is a prefix of it.
longest_first = sorted(wordDic, key=lambda key: -len(key))
escaped_keys = [re.escape(key) for key in longest_first]
rc = re.compile('|'.join(escaped_keys))