从字符串中提取链接的优化

11

我从网站获取数据(HTML字符串),想要提取所有的链接。我已经编写了一个函数(它可以工作),但是速度很慢...

你能帮我优化一下吗?有哪些标准函数可以使用? 函数逻辑:在文本中查找 "http://" 字符串,然后逐字符读取后面的内容,直到遇到双引号(")为止。

extension String {

subscript (i: Int) -> Character {
    return self[advance(self.startIndex, i)]
}

subscript (i: Int) -> String {
    return String(self[i] as Character)
}

subscript (r: Range<Int>) -> String {
    return substringWithRange(Range(start: advance(startIndex, r.startIndex), end: advance(startIndex, r.endIndex)))
}}



func extractAllLinks(text:String) -> Array<String>{
var stringArray = Array<String>()
var find = "http://" as String

for (var i = countElements(find); i<countElements(text); i++)
{
    var ch:Character = text[i - Int(countElements(find))]
    if (ch == find[0])
    {
        var j = 0
        while (ch == find[j])
        {
            var ch2:Character = find[j]
            if(countElements(find)-1 == j)
            {
                break
            }
            j++
            i++
            ch = text[i - Int(countElements(find))]
        }

        i -= j
        if (j == (countElements(find)-1))
        {
            var str = ""
            for (; text[i - Int(countElements(find))] != "\""; i++)
            {
                str += text[i - Int(countElements(find))]
            }
            stringArray.append(str)
        }

    }
}
return stringArray}

你可以使用正则表达式。 - Victor Sigler
1
你可以尝试使用开源库解析HTML。http://www.raywenderlich.com/14172/how-to-parse-html-on-ios - Uttam Sinha
8个回答

31

如AdamPro13所说,可以使用NSDataDetector轻松获取所有URL,参见以下代码:

// Swift 1.2: list every link that NSDataDetector finds in `text`.
let text = "http://www.google.com. http://www.bla.com"
let types: NSTextCheckingType = .Link
var error : NSError?

let detector = NSDataDetector(types: types.rawValue, error: &error)
// NSRange is measured in UTF-16 code units, so the range length must be the
// NSString length. count(text) counts Characters and is too short as soon as
// the text contains emoji or other characters outside the BMP.
var matches = detector!.matchesInString(text, options: nil, range: NSMakeRange(0, (text as NSString).length))

for match in matches {
   println(match.URL!)
}

它的输出结果为:

http://www.google.com
http://www.bla.com
已更新至Swift 2.0
// Swift 2.0: list every link that NSDataDetector finds in `text`.
let text = "http://www.google.com. http://www.bla.com"
let types: NSTextCheckingType = .Link

// The initialiser throws; `try?` turns a failure into nil.
let detector = try? NSDataDetector(types: types.rawValue)

guard let detect = detector else {
   return
}

// NSRange is measured in UTF-16 code units, so use utf16.count rather than
// characters.count; the latter is too short when the text contains emoji.
let matches = detect.matchesInString(text, options: .ReportCompletion, range: NSMakeRange(0, text.utf16.count))

for match in matches {
    // Bind the URL instead of force-unwrapping it.
    if let url = match.URL {
        print(url)
    }
}

请注意:guard 语句的 else 分支必须退出当前作用域(例如 return 或 break),因此上面的代码必须放在函数或循环内部才能编译。

希望这有所帮助。


是的,不用担心,这会帮助到另一个人。 - Victor Sigler
抱歉,无法处理不包含任何链接的文本。 我尝试了“hello swift” guard let detect = detector else { return } 但是没有起作用。 - Nahit Rüştü Heper
我遇到了以下错误: 无法使用类型为“(types: UInt64, error: inout NSError?)”的参数列表调用“NSDataDetector”的初始化程序 - Sachin Tanpure

25

这就是关于 Swift 5.0 的答案。

let text = "http://www.google.com. http://www.bla.com"

/// Returns every URL that `NSDataDetector` recognises in `text`.
///
/// - Parameter text: The text to scan for links.
/// - Returns: The detected URLs in order of appearance, or an empty array when
///   nothing is found or the detector cannot be created.
func checkForUrls(text: String) -> [URL] {
    let types: NSTextCheckingResult.CheckingType = .link

    do {
        let detector = try NSDataDetector(types: types.rawValue)

        // NSRange counts UTF-16 code units. Using `text.count` (Characters)
        // makes the range too short when the text contains emoji, so links
        // near the end get truncated or missed.
        let fullRange = NSRange(text.startIndex..<text.endIndex, in: text)
        let matches = detector.matches(in: text, options: .reportCompletion, range: fullRange)

        return matches.compactMap { $0.url }
    } catch {
        debugPrint(error.localizedDescription)
        return []
    }
}

// Example call; the returned [URL] is not used here.
checkForUrls(text: text)

5

详情

  • Swift 5.2,Xcode 11.4(11E146)

解决方案

// MARK: DataDetector

/// Convenience wrapper around `NSDataDetector` that returns matches as `String`s.
class DataDetector {

    /// Runs a detector of the given `type` over `string` and passes each matched
    /// substring to `iterationClosure`, stopping as soon as the closure returns `false`.
    ///
    /// Does nothing if the detector cannot be created.
    private class func _find(all type: NSTextCheckingResult.CheckingType,
                             in string: String, iterationClosure: (String) -> Bool) {
        guard let detector = try? NSDataDetector(types: type.rawValue) else { return }
        let fullRange = NSRange(string.startIndex ..< string.endIndex, in: string)
        let matches = detector.matches(in: string, options: [], range: fullRange)
        loop: for match in matches {
            for i in 0 ..< match.numberOfRanges {
                // NSRange offsets are UTF-16 code units, not Character offsets.
                // Convert with Range(_:in:) instead of index(_:offsetBy:), which
                // returns the wrong slice or traps when the text contains emoji.
                guard let range = Range(match.range(at: i), in: string) else { continue }
                guard iterationClosure(String(string[range])) else { break loop }
            }
        }
    }

    /// Returns every substring of `string` that the detector recognises as `type`.
    class func find(all type: NSTextCheckingResult.CheckingType, in string: String) -> [String] {
        var results = [String]()
        _find(all: type, in: string) {
            results.append($0)
            return true
        }
        return results
    }

    /// Returns the first substring of `string` recognised as `type`, or `nil` if there is none.
    class func first(type: NSTextCheckingResult.CheckingType, in string: String) -> String? {
        var result: String?
        _find(all: type, in: string) {
            result = $0
            return false // stop after the first match
        }
        return result
    }
}

// MARK: String extension

extension String {
    /// All link substrings that `DataDetector` finds in this string.
    var detectedLinks: [String] {
        return DataDetector.find(all: .link, in: self)
    }

    /// The first link substring found in this string, if any.
    var detectedFirstLink: String? {
        return DataDetector.first(type: .link, in: self)
    }

    /// The detected links converted to `URL`s; links that `URL(string:)` rejects are dropped.
    var detectedURLs: [URL] {
        return detectedLinks.compactMap { link in URL(string: link) }
    }

    /// The first detected link as a `URL`, or `nil` if none is found or it cannot be converted.
    var detectedFirstURL: URL? {
        return detectedFirstLink.flatMap { link in URL(string: link) }
    }
}

使用方法

// Sample text containing a bare domain with a trailing slash, an http:// URL
// followed by a full stop, and another bare domain.
let text = """
Lorm Ipsum is simply dummy text of the printing and typesetting industry. apple.com/ Lorem Ipsum has been the industry's standard dummy text ever since the 1500s, when an unknown printer took a galley of type and scrambled it to make a type specimen book. http://gooogle.com. It has survived not only five centuries, but also the leap into electronic typesetting, remaining essentially unchanged. yahoo.com It was popularised in the 1960s with the release of Letraset sheets containing Lorem Ipsum passages, and more recently with desktop publishing software like Aldus PageMaker including versions of Lorem Ipsum.
"""

// Print the results of the four String helpers; the optional ones print
// wrapped in Optional(...).
print(text.detectedLinks)
print(text.detectedFirstLink)
print(text.detectedURLs)
print(text.detectedFirstURL)

控制台输出

["apple.com/", "http://gooogle.com", "yahoo.com"]
Optional("apple.com/")
[apple.com/, http://gooogle.com, yahoo.com]
Optional(apple.com/)

嗨,伙计,我们如何避免电子邮件检测?我在运行你的代码时遇到了电子邮件问题。 - famfamfam
它几乎完美运作!..我发现使用含有表情符号的代码片段时出现了崩溃。 我用以下的for-in子句修复了它:for match in matches { guard iterationClosure((string as NSString).substring(with: match.range)) else { break } } - albirrojo7

4

非常有帮助的讨论!这是一个基于Victor Sigler的答案的可行示例(注意:下面的代码使用了 do/try/catch 语法,需要 Swift 2.0 或更高版本,而不是 Swift 1.2)。

    // Extract the first link (if available) and open it.
    let text = "How technology is changing our relationships to each other: http://t.ted.com/mzRtRfX"
    let types: NSTextCheckingType = .Link

    do {
        let detector = try NSDataDetector(types: types.rawValue)
        // NSRange is measured in UTF-16 code units, so use utf16.count rather
        // than characters.count (too short when the text contains emoji).
        let matches = detector.matchesInString(text, options: .ReportCompletion, range: NSMakeRange(0, text.utf16.count))
        // Bind the first match's URL instead of indexing and force-unwrapping.
        if let url = matches.first?.URL {
            print("Opening URL: \(url)")
            UIApplication.sharedApplication().openURL(url)
        }

    } catch {
        // Reached only when the detector itself cannot be created; "no link
        // found" is the case where the `if let` above does not bind.
        print ("error in findAndOpenURL detector")
    }

更新了针对Swift 2.0的答案,包括错误处理和guard语句。希望这能帮到你。 - Victor Sigler

1

0

正如其他人所指出的,最好使用正则表达式、数据检测器或解析库。然而,关于您的字符串处理的具体反馈:

Swift 字符串的关键是要接受它们的单向性质。往往情况下,整数索引和随机访问并不必要。正如 @gnasher729 所指出的,每次调用 count 都会迭代字符串。同样,整数索引扩展是线性的,因此如果在循环中使用它们,很容易意外地创建一个二次或三次复杂度算法。

但在这种情况下,没有必要把字符串索引转换成可随机访问的整数。下面是我认为逻辑类似的一个版本(先查找前缀,再从该处查找 " 字符;忽略 https、大小写等情况),它只使用原生的字符串索引:

// Swift 1.2 version: collects every substring of `text` that starts with
// "http://" and ends just before the next double quote, using only
// String.Index values (no integer offsets).
//
// - Parameter text: the text to scan.
// - Returns: the links found, in order of appearance.
func extractAllLinks(text: String) -> [String] {
    var links: [String] = []
    let prefix = "http://"
    // NOTE(review): `prefixLen` is not used anywhere below.
    let prefixLen = count(prefix)

    for var idx = text.startIndex; idx != text.endIndex; ++idx {
        // Slice of `text` from the current position to the end.
        let candidate = text[idx..<text.endIndex]
        // Proceed only when the slice starts with the prefix AND a closing
        // quote exists somewhere after it.
        if candidate.hasPrefix(prefix),
           let closingQuote = find(candidate, "\"") {
            let link = candidate[candidate.startIndex..<closingQuote]
            links.append(link)
            // Skip over the link just recorded; the loop's ++idx then steps
            // past the closing quote.
            idx = advance(idx, count(link))
        }
    }
    return links
}

// Sample input: two quoted http:// links inside one sentence.
let text = "This contains the link \"http://www.whatever.com/\" and"
         + " the link \"http://google.com\""

// Run the extractor on the sample; the result is not stored here.
extractAllLinks(text)

即使这样也还可以进一步优化(advance(idx, count(link)) 有点低效)——前提是有其他辅助函数可用(例如 findFromIndex 之类),或者愿意完全不用字符串切片、自己手写对结束字符的搜索。


谢谢!我理解这个想法,但是我不理解这段代码 let candidate = text[idx..<text.endIndex] if candidate.hasPrefix(prefix), let closingQuote = find(candidate, "\"") { let link = candidate[candidate.startIndex..<closingQuote] links.append(link) idx = advance(idx, count(link)) } 你能描述一下吗?(并修复它,我无法编译它) - Vasily Bodnarchuk
let candidate = text[idx..<text.endIndex] 创建一个从 idx 开始的子字符串。请注意,子字符串实际上只是对真实字符串的视图,它们不会创建一个新的字符串。然后 if candidate.hasPrefix(prefix), let closingQuote = find(candidate, "\"") 表示,如果该子字符串以 "http" 开头,则尝试在其中查找一个闭合引号的索引。如果找到了,那么它将从开头创建一个新的子字符串到那个闭合引号,并将其附加到数组中。最后,它将 idx 前移刚刚发现的 url 的长度。 - Airspeed Velocity

0

我想知道你是否意识到:每次调用 countElements 时,都会执行一个开销很大的复杂函数——它必须扫描字符串中的所有 Unicode 字符,从中划分出扩展字形簇并逐个计数。即使你不知道什么是扩展字形簇,也应该能想象到这个操作并不便宜,用在这里完全是大材小用。

只需将其转换为NSString*,调用rangeOfString即可完成。

显然,你所做的完全不安全,因为http://并不意味着有链接。你不能只在html中查找字符串并希望它有效;它不起作用。然后还有https、Http、hTtp、htTp、httP等等。但这都很容易,对于真正的恐怖,请参阅Uttam Sinha评论中的链接。


谢谢!我使用常量 var textLength = countElements(text) var findLength = countElements(find)(所以我只使用了2次countElements)。结果-快了2倍。但是NSDataDetector的速度更快...我认为我会使用它。 - Vasily Bodnarchuk

0

Swift 5.8.1

import Foundation

extension String {
    /// The substrings of this string that `NSDataDetector` recognises as links,
    /// in order of appearance. If the detector cannot be created, the error's
    /// description is printed and an empty array is returned.
    var urls: [String] {
        let linkDetector: NSDataDetector
        do {
            linkDetector = try NSDataDetector(types: NSTextCheckingResult.CheckingType.link.rawValue)
        } catch let error {
            print(error.localizedDescription)
            return []
        }

        // Work in NSString terms throughout so every range is in UTF-16 units.
        let source = self as NSString
        let wholeRange = NSMakeRange(0, source.length)
        return linkDetector
            .matches(in: self, options: [], range: wholeRange)
            .map { source.substring(with: $0.range) }
    }
}

使用方法:

// Sample input mixing a full https URL, an emoji and two bare domains.
let str = " Hello world. https://www.google.com , blabla ❤️ apple.com, www.test.com"
print(str.urls) //["https://www.google.com", "apple.com", "www.test.com"]

网页内容由stack overflow 提供, 点击上面的
可以查看英文原文,
原文链接