Soundex 算法最初是为英语设计的。由于土耳其语包含许多特殊字符,我一直找不到适用于土耳其语的实现。这里有一个很好的英语版示例:Soundex implementation in python。
答案 0 :(得分:0)
基于上面的链接描述,我创建了以下2个python函数来处理土耳其语的soundex。希望它也会对您有所帮助。
def tr_lower(text):
    """Return ``text`` lower-cased using Turkish casing rules.

    The dotted capital ``İ`` becomes ``i`` and the plain capital ``I``
    becomes the dotless ``ı``; ``Ç``, ``Ş``, ``Ü`` and ``Ğ`` are mapped to
    their lower-case forms explicitly. Every remaining character is handled
    by ``str.lower``.
    """
    # One translation pass replaces the chain of single-character
    # substitutions; the mapping must run before ``lower()`` so that "I" and
    # "İ" do not receive their default (non-Turkish) lower-case forms.
    turkish_upper = "İIÇŞÜĞ"
    turkish_lower = "iıçşüğ"
    translated = text.translate(str.maketrans(turkish_upper, turkish_lower))
    return translated.lower()
def soundex(query: str) -> str:
    """
    Compute a Soundex-style key for a Turkish (or English) word.

    https://en.wikipedia.org/wiki/Soundex

    The key is the first letter of the word (lower-case, with a Turkish
    first letter folded to its ASCII look-alike) followed by exactly three
    digits, e.g. ``"şule"`` and ``"sule"`` both give ``"s400"``.

    This is the "discard vowels first" variant: vowels, ``y``, ``h`` and
    ``w`` are dropped before runs of equal digits are collapsed, so equal
    codes separated only by such letters merge into one digit.

    :param query: word to encode; characters that are not letters are ignored
    :return: one letter plus three digits, or ``""`` when ``query`` contains
        no letters at all
    """
    # Step 0: lower-case with Turkish casing rules and keep only the letters.
    letters = [char for char in tr_lower(query) if char.isalpha()]
    if not letters:
        return ""

    # Fold a Turkish first letter to ASCII; without this it is not possible
    # to match words like "şule" & "sule" (or "ömer" & "omer").
    ascii_initial = {'ı': 'i', 'ç': 'c', 'ş': 's', 'ö': 'o', 'ü': 'u', 'ğ': 'g'}
    first_letter = ascii_initial.get(letters[0], letters[0])

    # Consonant groups and their digits. Letters missing from this table
    # (a, e, ı, i, o, ö, u, ü, y, h, w and anything unknown) carry no digit.
    groups = {
        'bfpv': 1,
        'cçgğjkqsşxz': 2,
        'dt': 3,
        'l': 4,
        'mn': 5,
        'r': 6,
    }
    digit_of = {char: digit for group, digit in groups.items() for char in group}

    # Steps 1-3: drop the uncoded letters, replace the remaining consonants
    # with digits and collapse each run of equal digits into a single digit.
    digits = []
    for char in letters[1:]:
        digit = digit_of.get(char)
        if digit is None:
            continue
        if not digits or digits[-1] != digit:
            digits.append(digit)

    # Step 4: if the first letter's own digit equals the first resulting
    # digit, drop that digit (the letter already stands for it).
    if digits and digits[0] == digit_of.get(first_letter):
        del digits[0]

    # Step 5: keep at most three digits and pad with zeros up to three.
    digits = digits[:3]
    digits.extend([0] * (3 - len(digits)))
    return first_letter + "".join(str(digit) for digit in digits)