优化从字符串中提取链接的函数

时间:2015-04-07 16:47:20

标签: ios string swift

我从网站获取数据(HTML 字符串),想从中提取所有链接。我写了一个函数(它可以工作),但它太慢了......

你能帮我优化一下吗?我可以使用哪些标准函数? 函数逻辑:在文本中找到 "http://" 字符串,然后逐个字符(char by char)读取,直到遇到 "\"" 为止。

extension String {

    /// Returns the `Character` located `i` positions after the start of the string.
    ///
    /// Every call advances from `startIndex`, so the cost grows with `i`.
    subscript (i: Int) -> Character {
        let position = advance(startIndex, i)
        return self[position]
    }

    /// Returns the character at integer offset `i` as a one-character `String`.
    subscript (i: Int) -> String {
        let character: Character = self[i]
        return String(character)
    }

    /// Returns the substring covering the integer offsets in `r`.
    ///
    /// Both bounds are reached by advancing from `startIndex`.
    subscript (r: Range<Int>) -> String {
        let lowerBound = advance(startIndex, r.startIndex)
        let upperBound = advance(startIndex, r.endIndex)
        return substringWithRange(Range(start: lowerBound, end: upperBound))
    }
}



/// Extracts every `http://` link that is terminated by a double quote.
///
/// The text is scanned from left to right for the literal prefix `http://`.
/// Each hit is read up to (but not including) the next `"` character and the
/// scan resumes right after that quote. A prefix that is never followed by a
/// closing quote is not reported.
///
/// `text` is the string (for example an HTML document) to search. The result
/// contains the links in order of appearance, each starting with `http://`.
func extractAllLinks(text:String) -> Array<String>{
    var stringArray = Array<String>()

    // Copy both strings into arrays once, so every lookup below is a plain
    // array index instead of a walk from the start of the string.
    let prefix = "http://"
    let textChars = Array(text)
    let prefixChars = Array(prefix)

    var start = 0
    while (start + prefixChars.count <= textChars.count)
    {
        // Count how many leading characters of the prefix match at `start`.
        var matched = 0
        while (matched < prefixChars.count && textChars[start + matched] == prefixChars[matched])
        {
            matched++
        }

        // Require the whole prefix, including its final "/".
        if (matched < prefixChars.count)
        {
            start++
            continue
        }

        // Look for the closing quote without running past the end of the text.
        var end = start + prefixChars.count
        while (end < textChars.count && textChars[end] != "\"")
        {
            end++
        }

        if (end == textChars.count)
        {
            // No quote is left in the text, so no later link can be terminated either.
            break
        }

        var link = ""
        for position in start..<end
        {
            link.append(textChars[position])
        }
        stringArray.append(link)

        // Resume right after the closing quote.
        start = end + 1
    }
    return stringArray
}

7 个答案:

答案 0 :(得分:22)

正如 AdamPro13 上面所说,使用 NSDataDetector 可以轻松获取所有网址,请参阅以下代码:

// Detect every link in `text` with NSDataDetector and print each URL.
let text = "http://www.google.com. http://www.bla.com"
let types: NSTextCheckingType = .Link
var error : NSError?

// The initializer is failable here; bind the result instead of force-unwrapping it.
if let detector = NSDataDetector(types: types.rawValue, error: &error) {
    // NSRange is measured in UTF-16 code units, so use the NSString length
    // rather than the number of Characters.
    let fullRange = NSMakeRange(0, (text as NSString).length)
    let matches = detector.matchesInString(text, options: nil, range: fullRange)

    for match in matches {
        // Skip any result that carries no URL instead of crashing on it.
        if let url = (match as? NSTextCheckingResult)?.URL {
            println(url)
        }
    }
}

输出:

http://www.google.com
http://www.bla.com
  

已更新至Swift 2.0

// Detect every link in `text` with NSDataDetector and print each URL.
let text = "http://www.google.com. http://www.bla.com"
let types: NSTextCheckingType = .Link

let detector = try? NSDataDetector(types: types.rawValue)

// Bail out when the detector could not be created.
guard let detect = detector else {
   return
}

// NSRange counts UTF-16 code units, so measure the string the same way.
let matches = detect.matchesInString(text, options: .ReportCompletion, range: NSMakeRange(0, text.utf16.count))

for match in matches {
    // Skip any result without a URL instead of force-unwrapping it.
    if let url = match.URL {
        print(url)
    }
}

请记住,要像上面那样使用 guard 语句,这段代码必须位于函数或循环内部。

我希望这有帮助。

答案 1 :(得分:7)

这是适用于 Swift 4.0 的答案:
// Detect every link in `text` with NSDataDetector and print each URL.
let text = "http://www.google.com. http://www.bla.com"
let types: NSTextCheckingResult.CheckingType = .link

let detector = try? NSDataDetector(types: types.rawValue)

// Bail out when the detector could not be created.
guard let detect = detector else {
    return
}

// Search the string declared above. The NSRange is derived from the string's
// own indices so that its length is expressed in UTF-16 code units.
let fullRange = NSRange(text.startIndex..<text.endIndex, in: text)
let matches = detect.matches(in: text, options: .reportCompletion, range: fullRange)

for match in matches {
    // Skip any result without a URL instead of force-unwrapping it.
    if let url = match.url {
        print(url)
    }
}

答案 2 :(得分:4)

非常有帮助的帖子!这是根据 Victor Sigler 的答案改写的一个可运行示例(注意:代码使用了 do/try/catch,因此需要 Swift 2 而不是 Swift 1.2)。

    // extract first link (if available) and open it!
    let text = "How technology is changing our relationships to each other: http://t.ted.com/mzRtRfX"
    let types: NSTextCheckingType = .Link

    do {
        let detector = try NSDataDetector(types: types.rawValue)
        // NSRange counts UTF-16 code units, so measure the string the same way.
        let matches = detector.matchesInString(text, options: .ReportCompletion, range: NSMakeRange(0, text.utf16.count))
        // Open the first match only when it actually carries a URL.
        if let url = matches.first?.URL {
            print("Opening URL: \(url)")
            UIApplication.sharedApplication().openURL(url)
        }

    } catch {
        // Reached only when the detector could not be created; finding no
        // link simply leaves `matches` empty.
        print ("error in findAndOpenURL detector")
    }

答案 3 :(得分:3)

详细信息

  • Swift 5.2,Xcode 11.4(11E146)

解决方案

// MARK: DataDetector

/// Thin wrapper around `NSDataDetector` that returns matches as Swift strings.
class DataDetector {

    /// Runs a detector for `type` over `string` and hands each matched substring to
    /// `iterationClosure`, stopping as soon as the closure returns `false`.
    ///
    /// - Parameters:
    ///   - type: The kind of data to look for (for example `.link`).
    ///   - string: The text to scan.
    ///   - iterationClosure: Receives each matched substring; return `false` to stop early.
    private class func _find(all type: NSTextCheckingResult.CheckingType,
                             in string: String, iterationClosure: (String) -> Bool) {
        guard let detector = try? NSDataDetector(types: type.rawValue) else { return }
        let fullRange = NSRange(string.startIndex ..< string.endIndex, in: string)
        let matches = detector.matches(in: string, options: [], range: fullRange)
        loop: for match in matches {
            for i in 0 ..< match.numberOfRanges {
                // NSRange bounds are UTF-16 offsets, not Character offsets, so they must be
                // converted with Range(_:in:). Ranges that cannot be mapped are skipped.
                guard let range = Range(match.range(at: i), in: string) else { continue }
                guard iterationClosure(String(string[range])) else { break loop }
            }
        }
    }

    /// Returns every substring of `string` that the detector recognises as `type`.
    ///
    /// - Returns: The matches in the order the detector reports them; empty when
    ///   nothing matches or the detector cannot be created.
    class func find(all type: NSTextCheckingResult.CheckingType, in string: String) -> [String] {
        var results = [String]()
        _find(all: type, in: string) {
            results.append($0)
            return true
        }
        return results
    }

    /// Returns the first substring of `string` that the detector recognises as `type`.
    ///
    /// - Returns: The first match, or `nil` when there is none.
    class func first(type: NSTextCheckingResult.CheckingType, in string: String) -> String? {
        var result: String?
        _find(all: type, in: string) {
            result = $0
            return false
        }
        return result
    }
}

// MARK: String extension

extension String {
    /// Every link-like substring that `DataDetector` finds in this string.
    var detectedLinks: [String] {
        return DataDetector.find(all: .link, in: self)
    }

    /// The first link-like substring that `DataDetector` finds, if any.
    var detectedFirstLink: String? {
        return DataDetector.first(type: .link, in: self)
    }

    /// The detected links that `URL(string:)` is able to parse.
    var detectedURLs: [URL] {
        return detectedLinks.compactMap { link in URL(string: link) }
    }

    /// The first detected link converted with `URL(string:)`, or `nil` when no
    /// link is found or the conversion fails.
    var detectedFirstURL: URL? {
        return detectedFirstLink.flatMap { link in URL(string: link) }
    }
}

用法

// Sample paragraph containing three link-like substrings: "apple.com/",
// "http://gooogle.com" and "yahoo.com".
let text = """
Lorm Ipsum is simply dummy text of the printing and typesetting industry. apple.com/ Lorem Ipsum has been the industry's standard dummy text ever since the 1500s, when an unknown printer took a galley of type and scrambled it to make a type specimen book. http://gooogle.com. It has survived not only five centuries, but also the leap into electronic typesetting, remaining essentially unchanged. yahoo.com It was popularised in the 1960s with the release of Letraset sheets containing Lorem Ipsum passages, and more recently with desktop publishing software like Aldus PageMaker including versions of Lorem Ipsum.
"""

// All detected links as strings.
print(text.detectedLinks)
// First detected link; the value is optional, so it prints wrapped in Optional(...).
print(text.detectedFirstLink)
// The detected links converted to URL values.
print(text.detectedURLs)
// First detected link converted to a URL (also optional).
print(text.detectedFirstURL)

控制台输出

["apple.com/", "http://gooogle.com", "yahoo.com"]
Optional("apple.com/")
[apple.com/, http://gooogle.com, yahoo.com]
Optional(apple.com/)

答案 4 :(得分:0)

实际上有一个名为 NSDataDetector 的类可以为你检测链接。

你可以在NSHipster上找到它的一个例子:http://nshipster.com/nsdatadetector/

答案 5 :(得分:0)

我想知道你是否意识到,每次调用 countElements 都会执行一个相当复杂的函数:它必须扫描字符串中的所有 Unicode 字符,从中提取扩展字形簇并对它们计数。即使你不知道扩展字形簇是什么,也应该能想象到这个操作并不便宜,而且在这里完全是大材小用。

只需将其转换为NSString *,调用rangeOfString即可完成。

显然你的做法完全不可靠,因为出现 http:// 并不意味着那里有一个链接。你不能只在 HTML 中查找字符串并指望它有效;它不会有效。此外还有 https、Http、hTtp、htTp、httP 等各种写法。但这些还算简单的问题,真正可怕的内容请看 Uttam Sinha 评论中的链接。

答案 6 :(得分:0)

正如其他人所指出的,最好使用正则表达式,数据检测器或解析库。但是,作为字符串处理的具体反馈:

Swift字符串的关键是拥抱它们的前向性。通常,整数索引和随机访问不是必需的。正如@ gnasher729指出的那样,每次调用count时,你都在迭代字符串。类似地,整数索引扩展是线性的,因此如果在循环中使用它们,则很容易意外地创建二次或三次复杂度算法。

但是在这种情况下,没有必要费力把字符串索引转换为可随机访问的整数。下面是一个我认为执行了类似逻辑的版本(先寻找前缀,再从那里寻找 " 字符——暂不考虑它不处理 https、大小写等问题),只使用原生的字符串索引:

/// Extracts every `http://` link that is followed by a closing double quote.
///
/// Walks `text` with native string indices. Whenever the remainder of the text
/// starts with `http://` and still contains a `"`, everything up to (but not
/// including) that quote is collected as a link.
///
/// `text` is the string to search. The result contains the links in order of
/// appearance; a prefix with no quote after it is skipped.
func extractAllLinks(text: String) -> [String] {
    var links: [String] = []
    let prefix = "http://"

    var idx = text.startIndex
    while idx != text.endIndex {
        let candidate = text[idx..<text.endIndex]
        if candidate.hasPrefix(prefix),
           let closingQuote = find(candidate, "\"") {
            let link = candidate[candidate.startIndex..<closingQuote]
            links.append(link)
            // Jump to the closing quote; the step below then moves past it.
            idx = advance(idx, count(link))
        }
        idx = idx.successor()
    }
    return links
}

// Sample input: two links, each wrapped in double quotes.
let text = "This contains the link \"http://www.whatever.com/\" and"
         + " the link \"http://google.com\""

// Expected result: ["http://www.whatever.com/", "http://google.com"]
extractAllLinks(text)

即使这样仍可进一步优化(advance(idx, count()) 的效率不太高)——前提是有 findFromIndex 之类的其他辅助函数,或者愿意放弃字符串切片、手动实现对结束字符的搜索。