在Objective-C或Swift中的Jaro Winkler距离

时间:2019-02-26 16:38:55

标签: ios objective-c nsstring jaro-winkler

我需要对大量字符串进行模糊比较,目前正在考虑 Jaro-Winkler 算法,因为它会考虑字母顺序上的差异。有没有人知道如何在 Objective-C 或 Swift 中做到这一点——无论是使用 Jaro-Winkler,还是使用 iOS 自带的某种方法?

感谢您的任何建议。

1 个答案:

答案 0 :(得分:0)

我参考了 Apache Commons 中的实现,并将其用 Swift 重写:

extension String {
    /// Computes the Jaro-Winkler score of two strings, rounded to two decimals.
    ///
    /// Despite the "distance" in the name, the returned value is a similarity:
    /// `1` means the strings are identical and `0` means no characters matched
    /// (which includes the case where either string is empty).
    ///
    /// This variant does not cap the common-prefix length; instead the scaling
    /// factor is limited to `min(0.1, 1 / longerLength)` so the result never
    /// exceeds `1`. The prefix bonus is only applied when the Jaro score is at
    /// least `0.7`.
    ///
    /// - Parameters:
    ///   - first: The first string to compare.
    ///   - second: The second string to compare.
    /// - Returns: A score in `0...1`, rounded to the nearest hundredth.
    static func jaroWinklerDistance(_ first: String, _ second: String) -> Double {
        // `String.count` is O(n); materialise the characters once and reuse them.
        let firstChars = Array(first)
        let secondChars = Array(second)

        // On equal lengths `second` is treated as the longer one.
        let (longer, shorter) = firstChars.count > secondChars.count
            ? (firstChars, secondChars)
            : (secondChars, firstChars)

        let (numMatches, numTranspositions) = jaroWinklerData(longer: longer, shorter: shorter)

        // No matches (also covers empty input) - avoids dividing by zero below.
        guard numMatches > 0 else {
            return 0
        }

        let defaultScalingFactor = 0.1
        let percentageRoundValue = 100.0

        let jaro = (
            numMatches / Double(firstChars.count)
                + numMatches / Double(secondChars.count)
                + (numMatches - numTranspositions) / numMatches
        ) / 3

        let jaroWinkler: Double
        if jaro < 0.7 {
            jaroWinkler = jaro
        } else {
            let commonPrefixLength = Double(commonPrefixLength(firstChars, secondChars))
            let scalingFactor = Swift.min(defaultScalingFactor, 1 / Double(longer.count))
            jaroWinkler = jaro + scalingFactor * commonPrefixLength * (1 - jaro)
        }

        // `rounded()` is part of the standard library, so Foundation is not required.
        return (jaroWinkler * percentageRoundValue).rounded() / percentageRoundValue
    }

    /// Original (misspelled) entry point, kept so existing callers keep working.
    ///
    /// Forwards to `jaroWinklerDistance(_:_:)` and returns exactly the same value.
    static func jaroWinglerDistance(_ first: String, _ second: String) -> Double {
        return jaroWinklerDistance(first, second)
    }

    /// Returns how many leading characters the two character arrays share.
    private static func commonPrefixLength(_ first: [Character], _ second: [Character]) -> Int {
        return zip(first, second)
            .prefix(while: { $0.0 == $0.1 })
            .count
    }

    /// Counts matching characters and transpositions between two character arrays.
    ///
    /// - Parameters:
    ///   - longer: The characters of the longer input; must not be shorter than `shorter`.
    ///   - shorter: The characters of the shorter input.
    /// - Returns: The number of matched characters and the number of
    ///   transpositions (mismatched positions among the matches, halved).
    private static func jaroWinklerData(
        longer: [Character],
        shorter: [Character]
    ) -> (numMatches: Double, numTranspositions: Double) {
        // A character only matches within this many positions of its own offset.
        let window = Swift.max(longer.count / 2 - 1, 0)

        var shorterMatchedChars: [Character] = []
        var longerMatches = [Bool](repeating: false, count: longer.count)

        for (offset, shorterChar) in shorter.enumerated() {
            let windowRange = Swift.max(offset - window, 0) ..< Swift.min(offset + window + 1, longer.count)
            // Claim the first not-yet-matched equal character inside the window.
            if let matchOffset = windowRange.first(where: { !longerMatches[$0] && shorterChar == longer[$0] }) {
                shorterMatchedChars.append(shorterChar)
                longerMatches[matchOffset] = true
            }
        }

        // Matched characters of `longer`, in the order they appear in `longer`.
        let longerMatchedChars = longer.indices
            .filter { longerMatches[$0] }
            .map { longer[$0] }

        // Each transposition shows up as two mismatched positions, hence the `/ 2`.
        let numTranspositions = zip(shorterMatchedChars, longerMatchedChars)
            .filter { $0.0 != $0.1 }
            .count / 2

        return (
            numMatches: Double(shorterMatchedChars.count),
            numTranspositions: Double(numTranspositions)
        )
    }
}

通过原始代码中的示例进行了测试:

// Sample input pairs; one score is printed per pair, in the order listed.
let samplePairs: [(String, String)] = [
    ("", ""),
    ("", "a"),
    ("aaapppp", ""),
    ("frog", "fog"),
    ("fly", "ant"),
    ("elephant", "hippo"),
    ("hippo", "elephant"),
    ("hippo", "zzzzzzzz"),
    ("hello", "hallo"),
    ("ABC Corporation", "ABC Corp"),
    ("D N H Enterprises Inc", "D & H Enterprises, Inc."),
    ("My Gym Children's Fitness Center", "My Gym. Childrens Fitness"),
    ("PENNSYLVANIA", "PENNCISYLVNIA"),
]

for pair in samplePairs {
    print(String.jaroWinglerDistance(pair.0, pair.1))
}

我还在 GitHub 上找到了字符串相似度函数的另一种实现。