我已经获取了佛蒙特州彩票网站(this page)页面的全部HTML代码。该页面中有一个很大的表格:
我使用此代码(在Playground中)成功获取该页面的HTML:
import UIKit
import Foundation
import PlaygroundSupport
// Downloads the raw HTML of the Vermont Lottery "outstanding prizes" page
// and prints it to the Playground console.

// The URL is a compile-time constant, so a failure to parse it is a
// programmer error and force-unwrapping is acceptable here.
let url: URL = URL(string: "https://vtlottery.com/games/instant-tickets/outstanding-prizes")!
let session = URLSession.shared

// `URLRequest` is the Swift value type; using it directly avoids the
// `NSMutableURLRequest` + `as URLRequest` round trip.
var request = URLRequest(url: url)
request.httpMethod = "POST"
request.cachePolicy = .reloadIgnoringCacheData
let paramString = "data=Hello"
request.httpBody = paramString.data(using: .utf8)

// The request completes asynchronously, so the Playground must be told to
// keep running after the last top-level statement has executed.
PlaygroundPage.current.needsIndefiniteExecution = true

// `session` is an instance, not a function: call `dataTask` on it directly.
let task = session.dataTask(with: request) { data, response, error in
    // Swift 3 removed `where` from `guard`; conditions are comma-separated.
    guard let data = data, response != nil, error == nil else {
        print("error")
        return
    }
    // Decoding can fail when the body is not valid UTF-8; report it instead
    // of crashing on a force-unwrap.
    guard let dataString = String(data: data, encoding: .utf8) else {
        print("error: response body is not valid UTF-8")
        return
    }
    print(dataString)
}
task.resume()
如何从表中获取数据,以便将其转换为巨大的JSON哈希表(或类似表)?
基本上,我希望把该表格的HTML代码转换成类似下面的结构:
"row 1": {
"Price":"$20",
"Game #":"1333",
"Game Name":"Money",
"Top Prizes":["$150000", "$5000", "$500"],
"Unclaimed Top Prizes":["2", "16", "246"],
"Total Unclaimed":"$2479510",
"% Sold": "1",
"# Of Tickets":"183600"
},
"row 2": {
"Price":"$10",
"Game #":"1339",
"Game Name":"Diamonds & Pearls",
"Top Prizes":["$50000", "$5000", "$500"],
"Unclaimed Top Prizes":["3", "5", "174"],
"Total Unclaimed":"$1264925",
"% Sold": "4",
"# Of Tickets":"201650"
}, ... (etc.)
我如何开始解析这些数据?
答案 0 :(得分:1)
您需要一个HTML解析器来遍历响应中的HTML,千万不要考虑使用正则表达式。下面的答案使用HTMLReader,您可以通过CocoaPods将其添加到项目中。Playground在我这里总是崩溃,所以我把代码改写成了一个IBAction:
import UIKit
import MapKit
import HTMLReader
/// View controller that downloads the Vermont Lottery "outstanding prizes"
/// page and converts its HTML table into a dictionary of rows.
class ViewController: UIViewController {

    /// Parsed table rows keyed by `"row 1"`, `"row 2"`, ...
    /// Each row maps a column title to its value: a string for single-value
    /// columns, an array of strings for the two "top prizes" columns.
    var result = [String: [String: AnyObject]]()

    override func viewDidLoad() {
        super.viewDidLoad()
    }

    override func didReceiveMemoryWarning() {
        super.didReceiveMemoryWarning()
        // Dispose of any resources that can be recreated.
    }

    /// Downloads the page, parses the prize table and stores it in `result`.
    ///
    /// - Parameter sender: The control that triggered the action (unused).
    @IBAction func loadHTML(_ sender: AnyObject) {
        // Constant literal URL: failing to parse it is a programmer error.
        let url = URL(string: "https://vtlottery.com/games/instant-tickets/outstanding-prizes")!
        var request = URLRequest(url: url)
        request.httpMethod = "POST"
        request.cachePolicy = .reloadIgnoringCacheData
        let paramString = "data=Hello"
        request.httpBody = paramString.data(using: .utf8)

        // `URLSession.shared` is an instance, so `dataTask` is called on it
        // directly. `self` is captured weakly so an in-flight request does
        // not keep a dismissed view controller alive.
        let task = URLSession.shared.dataTask(with: request) { [weak self] data, response, error in
            // Swift 3 removed `where` from `guard`; conditions are comma-separated.
            guard let data = data, response != nil, error == nil else {
                print("error")
                return
            }

            // Parse off the main thread; only the final assignment touches
            // view-controller state.
            let rows = ViewController.parseRows(from: data)

            // The completion handler runs on a background queue, so publish
            // the parsed rows on the main queue.
            DispatchQueue.main.async {
                guard let strongSelf = self else { return }
                // Replace rather than merge so rows from an earlier load do
                // not linger when the table shrinks.
                strongSelf.result = rows

                // It's better to put a breakpoint here than to rely on the
                // console output, which is verbose.
                if let unclaimed = strongSelf.result["row 1"]?["Unclaimed Top Prizes"] {
                    print(unclaimed)
                } else {
                    print("no rows parsed")
                }
            }
        }
        task.resume()
    }

    /// Extracts every row of the `#tblData` table from the page HTML.
    ///
    /// Rows with fewer than eight `<td>` cells are skipped instead of
    /// crashing on an out-of-bounds index. Keys follow the requested schema
    /// (`"Top Prizes"`, `"# Of Tickets"`).
    ///
    /// - Parameter data: Raw response body, interpreted as UTF-8 HTML.
    /// - Returns: Rows keyed `"row 1"`, `"row 2"`, ... in document order.
    private static func parseRows(from data: Data) -> [String: [String: AnyObject]] {
        // Many cells pad their text with newlines, tabs or spaces.
        let spaceCharacterSet = CharacterSet(charactersIn: "\n\t ")
        // Highest column index read below is 7.
        let requiredColumnCount = 8

        let html = HTMLDocument(data: data, contentTypeHeader: "text/html; charset=utf-8")
        var rows = [String: [String: AnyObject]]()
        var index = 0

        // The table structure was taken from the browser's Web Inspector.
        for node in html.nodes(matchingSelector: "#tblData tbody tr") {
            let columns = node.nodes(matchingSelector: "td")
            guard columns.count >= requiredColumnCount else { continue }

            // Trimmed text of one cell, bridged explicitly because Swift 3
            // no longer converts `String` to `AnyObject` implicitly.
            func cellText(_ column: Int) -> AnyObject {
                return columns[column].textContent.trimmingCharacters(in: spaceCharacterSet) as AnyObject
            }

            let topPrizes = columns[3].nodes(matchingSelector: "p").map { $0.textContent }
            let unclaimedTopPrizes = columns[4].nodes(matchingSelector: "p").map { $0.textContent }

            let rowData: [String: AnyObject] = [
                "Price": cellText(0),
                "Game #": cellText(1),
                "Game Name": cellText(2),
                "Top Prizes": topPrizes as AnyObject,
                "Unclaimed Top Prizes": unclaimedTopPrizes as AnyObject,
                "Total Unclaimed": cellText(5),
                "% Sold": cellText(6),
                "# Of Tickets": cellText(7)
            ]

            index += 1
            rows["row \(index)"] = rowData
        }
        return rows
    }
}