从网页中提取表格

时间:2019-06-03 03:48:37

标签: r rvest

问题

我正在尝试在以下网页中下载表格: https://www.ato.gov.au/Rates/Individual-income-tax-for-prior-years/

我的尝试

/// A `UIAlertController` that animates `tintAdjustmentMode` changes on another view controller
/// taking part in its transition as the alert appears and disappears.
///
/// While appearing, the transition's `.from` view controller is set to `.dimmed`; while
/// disappearing, the transition's `.to` view controller is set back to `.automatic`.
class TintAdjustingAlertController: UIAlertController {
    override func viewWillAppear(_ animated: Bool) {
        super.viewWillAppear(animated)

        // Dim the tint of the controller the transition is coming from.
        animatePresentingViewTintAdjustmentMode(tintAdjustmentMode: .dimmed, forViewControllerAtKey: .from)
    }

    override func viewWillDisappear(_ animated: Bool) {
        super.viewWillDisappear(animated)

        // Restore automatic tint on the controller the transition is going to.
        animatePresentingViewTintAdjustmentMode(tintAdjustmentMode: .automatic, forViewControllerAtKey: .to)
    }

    /// Applies `mode` alongside the current transition's animation.
    ///
    /// Does nothing when there is no `transitionCoordinator`, or when the transition context
    /// has no view controller for `key`.
    ///
    /// - Parameters:
    ///   - mode: The tint adjustment mode to apply.
    ///   - key: Identifies which view controller of the transition context receives `mode`.
    private func animatePresentingViewTintAdjustmentMode(tintAdjustmentMode mode: UIView.TintAdjustmentMode, forViewControllerAtKey key: UITransitionContextViewControllerKey) {
        transitionCoordinator?.animate(alongsideTransition: { context in
            guard let target = context.viewController(forKey: key) else { return }

            if let navigationController = target as? UINavigationController {
                // For a navigation controller, update its bar and the view of every
                // controller on its stack rather than the navigation controller's own view.
                navigationController.navigationBar.tintAdjustmentMode = mode
                for stackedController in navigationController.viewControllers {
                    stackedController.view.tintAdjustmentMode = mode
                }
            } else {
                target.view.tintAdjustmentMode = mode
            }
        }, completion: nil)
    }
}

问题在于此代码返回639行数据。我希望导入的数据保持网页上具有的表结构之类的内容(例如表列表,甚至只是一个大数据框)。

1 个答案:

答案 0 :(得分:2)

我建议将它们保留为数据帧列表,并区分表,并用可用的caption命名

library(dplyr)
library(rvest)

url <- "https://www.ato.gov.au/Rates/Individual-income-tax-for-prior-years/"

# Build a list of data frames, one per <table> on the page, named by caption.
# The page is downloaded and parsed only once, and each table is named from its
# OWN <caption>, so a table without a caption gets an NA name instead of
# shifting the names of every table after it.
url %>%
  read_html() %>%
  html_nodes("table") %>%
  { setNames(html_table(.), html_text(html_node(., "caption"))) }


#$`Resident tax rates for 2016-17`
#      Taxable income                         Tax on this income
#1        0 – $18,200                                        Nil
#2  $18,201 – $37,000               19c for each $1 over $18,200
#3  $37,001 – $87,000 $3,572 plus 32.5c for each $1 over $37,000
#4 $87,001 – $180,000  $19,822 plus 37c for each $1 over $87,000
#5  $180,001 and over $54,232 plus 45c for each $1 over $180,000

#$`Resident tax rates for 2015-16`
#      Taxable income                         Tax on this income
#1        0 – $18,200                                        Nil
#2  $18,201 – $37,000               19c for each $1 over $18,200
#3  $37,001 – $80,000 $3,572 plus 32.5c for each $1 over $37,000
#4 $80,001 – $180,000  $17,547 plus 37c for each $1 over $80,000
#5  $180,001 and over $54,547 plus 45c for each $1 over $180,000
#......

如果您希望将其作为一个单独的数据帧,则可以将 bind_rows 与 .id 参数一起使用

# Same extraction, combined into one data frame. The page is downloaded and
# parsed only once, and each table is named from its own <caption>; bind_rows()
# then stores that name in the "id" column so rows stay traceable to their table.
url %>%
  read_html() %>%
  html_nodes("table") %>%
  { setNames(html_table(.), html_text(html_node(., "caption"))) } %>%
  bind_rows(.id = "id")