我需要对大量字符串进行模糊比较,并查看Jaro-Winkler,该字母尊重字母顺序上的差异。是否有人知道使用Jaro-Winkler或IOS固有的某种方法在Objective-C或Swift中执行此操作的方法?
感谢您的任何建议。
答案 0 :(得分:0)
我在Apache Commons中得到了启发,并将其重写为Swift:
extension String {
static func jaroWinglerDistance(_ first: String, _ second: String) -> Double {
let longer = Array(first.count > second.count ? first : second)
let shorter = Array(first.count > second.count ? second : first)
let (numMatches, numTranspositions) = jaroWinklerData(longer: longer, shorter: shorter)
if numMatches == 0 {
return 0
}
let defaultScalingFactor = 0.1;
let percentageRoundValue = 100.0;
let jaro = [
numMatches / Double(first.count),
numMatches / Double(second.count),
(numMatches - numTranspositions) / numMatches
].reduce(0, +) / 3
let jaroWinkler: Double
if jaro < 0.7 {
jaroWinkler = jaro
} else {
let commonPrefixLength = Double(commonPrefix(first, second).count)
jaroWinkler = jaro + Swift.min(defaultScalingFactor, 1 / Double(longer.count)) * commonPrefixLength * (1 - jaro)
}
return round(jaroWinkler * percentageRoundValue) / percentageRoundValue
}
private static func commonPrefix(_ first: String, _ second: String) -> String{
return String(
zip(first, second)
.prefix { $0.0 == $0.1 }
.map { $0.0 }
)
}
private static func jaroWinklerData(
longer: Array<Character>,
shorter: Array<Character>
) -> (numMatches: Double, numTranspositions: Double) {
let window = Swift.max(longer.count / 2 - 1, 0)
var shorterMatchedChars: [Character] = []
var longerMatches = Array<Bool>(repeating: false, count: longer.count)
for (offset, shorterChar) in shorter.enumerated() {
let windowRange = Swift.max(offset - window, 0) ..< Swift.min(offset + window + 1, longer.count)
if let matchOffset = windowRange.first(where: { !longerMatches[$0] && shorterChar == longer[$0] }) {
shorterMatchedChars.append(shorterChar)
longerMatches[matchOffset] = true
}
}
let longerMatchedChars = longerMatches
.enumerated()
.filter { $0.element }
.map { longer[$0.offset] }
let numTranspositions: Int = zip(shorterMatchedChars, longerMatchedChars)
.lazy
.filter { $0.0 != $0.1 }
.count / 2
return (
numMatches: Double(shorterMatchedChars.count),
numTranspositions: Double(numTranspositions)
)
}
}
通过原始代码中的示例进行了测试:
print(String.jaroWinglerDistance("", ""))
print(String.jaroWinglerDistance("", "a"))
print(String.jaroWinglerDistance("aaapppp", ""))
print(String.jaroWinglerDistance("frog", "fog"))
print(String.jaroWinglerDistance("fly", "ant"))
print(String.jaroWinglerDistance("elephant", "hippo"))
print(String.jaroWinglerDistance("hippo", "elephant"))
print(String.jaroWinglerDistance("hippo", "zzzzzzzz"))
print(String.jaroWinglerDistance("hello", "hallo"))
print(String.jaroWinglerDistance("ABC Corporation", "ABC Corp"))
print(String.jaroWinglerDistance("D N H Enterprises Inc", "D & H Enterprises, Inc."))
print(String.jaroWinglerDistance("My Gym Children's Fitness Center", "My Gym. Childrens Fitness"))
print(String.jaroWinglerDistance("PENNSYLVANIA", "PENNCISYLVNIA"))
我还在github中找到了String相似度函数的另一种实现。