在 R 中抓取 HTML 表格时,为什么得到的是字符串而不是整数?

时间:2018-02-18 15:03:41

标签: r web-scraping html-table rvest

我在从 iea.org 抓取数据表时遇到了困难。我使用的代码如下:

library("rvest")
url <- "http://www.iea.org/statistics/statisticssearch/report/?country=ZAMBIA&product=balances&year=2013"
# Fetch and parse the page. read_html() is used instead of html(), which
# rvest deprecated in favour of read_html().
energy <- url %>%
  read_html() %>%
  html_nodes(xpath='//*[@id="stats-container"]/div[2]/table') %>%
  html_table()
# html_nodes() may match several tables, so `energy` is a list of tables;
# head() prints the first few of them.
head(energy)

R 中得到的表格里,单元格只包含字母,而不是数字。感谢您的帮助。

1 个答案:

答案 0 :(得分:1)

除非另有证明(或网站所有者阅读了如何使用 robots.txt,并找到真正的律师来制定更明确、更具限制性的 T&Cs)……

我将从这个答案的非“tidyverse”解决方案开始:

library(rvest)

# Download and parse the report page once; `x` is reused further down.
x <- read_html(
  "http://www.iea.org/statistics/statisticssearch/report/?country=ZAMBIA&product=balances&year=2013"
)

# Pick the table by what it contains (the "International marine" row) instead
# of by its position in the page, which keeps the scraper less fragile.
# Then turn it into a data frame and normalise the column names.
xdf <- janitor::clean_names(
  html_table(
    html_node(x, xpath=".//table[contains(., 'International marine')]")
  )
)

可以看到,这些列确实如提问者和问题评论中所讨论的那样被编码了:

# Print one column as scraped: the cells are still encoded strings, not numbers.
xdf$oil_products
##  [1] "MA==" "Mzkx" "LTUw" "MA==" "LTUy" "MA==" "Mjkw" "MA==" "MQ==" "LTEw"
## [11] "MA==" "MA==" "MA==" "NjAx" "MA==" "MA==" "MA==" "LTE1" "MA==" "ODY2"
## [21] "MzQ2" "MzMy" "MTI0" "Nw==" "NDI=" "MjY=" "MA==" "NTA=" "NjM=" "MA=="

末尾的 `==` 表明这些值是 base64 编码的(评论中提到的 URL 也进一步证实了这一点)。每个单元格的值都被单独编码成了字符串,所以我们需要先进行 base64 解码,再转换为数字:

# Decode every column except the first one (the row labels). Each cell holds
# the base64 encoding of a number written as text, so decode to raw bytes,
# turn those into a string, and finally parse the string as a number.
value_cols <- seq_along(xdf)[-1]
xdf[value_cols] <- lapply(xdf[value_cols], function(col) {
  # vapply() guarantees a character vector even for an empty column, where
  # sapply() would silently return a list.
  decoded <- vapply(
    col,
    function(cell) rawToChar(openssl::base64_decode(cell)),
    character(1),
    USE.NAMES = FALSE
  )
  as.numeric(decoded)
})

用 glimpse() 代替 str() 快速查看数据结构:

# Transposed overview: one line per column with its type and leading values.
tibble::glimpse(xdf)
## Observations: 30
## Variables: 12
## $ x                    <chr> "Production", "Imports", "Exports", "International marine bunkers***", "International aviation bunkers***", "Stock c...
## $ coal                 <dbl> 88, 0, 0, 0, 0, 0, 88, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 88, 88, 0, 0, 0, 0, 0, 0, 0, 0, 0
## $ crude_oil            <dbl> 0, 618, 0, 0, 0, 21, 639, 0, 0, 0, 0, 0, 0, -639, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
## $ oil_products         <dbl> 0, 391, -50, 0, -52, 0, 290, 0, 1, -10, 0, 0, 0, 601, 0, 0, 0, -15, 0, 866, 346, 332, 124, 7, 42, 26, 0, 50, 63, 0
## $ natural_gas          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
## $ nuclear              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
## $ hydro                <dbl> 1142, 0, 0, 0, 0, 0, 1142, 0, 0, -1142, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
## $ geothermal_solar_etc <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
## $ biofuels_and_waste   <dbl> 7579, 0, 0, 0, 0, 0, 7579, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1661, 0, 0, 5918, 1479, 0, 4438, 4438, 0, 0, 0, 0, 0, 0
## $ electricity          <dbl> 0, 6, -93, 0, 0, 0, -87, 0, 0, 1144, 0, 0, 0, 0, 0, 0, 0, -26, -98, 933, 549, 2, 382, 289, 59, 23, 0, 10, 0, 0
## $ heat                 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
## $ total                <dbl> 8809, 1016, -143, 0, -52, 21, 9651, 0, 1, -9, 0, 0, 0, -39, 0, 0, -1661, -41, -98, 7805, 2462, 335, 4945, 4734, 101,...

更美观的打印输出:

# Print as a tibble, which shows only the first rows and the columns that fit.
tibble::as_tibble(xdf)
## # A tibble: 30 x 12
##                                    x  coal crude_oil oil_products natural_gas nuclear hydro geothermal_solar_etc biofuels_and_waste electricity  heat
##                                <chr> <dbl>     <dbl>        <dbl>       <dbl>   <dbl> <dbl>                <dbl>              <dbl>       <dbl> <dbl>
##  1                        Production    88         0            0           0       0  1142                    0               7579           0     0
##  2                           Imports     0       618          391           0       0     0                    0                  0           6     0
##  3                           Exports     0         0          -50           0       0     0                    0                  0         -93     0
##  4   International marine bunkers***     0         0            0           0       0     0                    0                  0           0     0
##  5 International aviation bunkers***     0         0          -52           0       0     0                    0                  0           0     0
##  6                     Stock changes     0        21            0           0       0     0                    0                  0           0     0
##  7                              TPES    88       639          290           0       0  1142                    0               7579         -87     0
##  8                         Transfers     0         0            0           0       0     0                    0                  0           0     0
##  9           Statistical differences     0         0            1           0       0     0                    0                  0           0     0
## 10                Electricity plants     0         0          -10           0       0 -1142                    0                  0        1144     0
## # ... with 20 more rows, and 1 more variables: total <dbl>

tidyverse 版本更简洁一些:

# Decode a vector of base64-encoded numbers.
#
# .x: character vector; each element is the base64 encoding of a number
#     written as text (e.g. "Mzkx" for "391").
# Returns a double vector with one decoded value per element of `.x`.
decode_cols <- function(.x) {
  # purrr is namespace-qualified like openssl, so the function works without
  # attaching purrr; the callback argument gets its own name instead of
  # shadowing the outer `.x`.
  purrr::map_dbl(.x, function(cell) {
    as.numeric(rawToChar(openssl::base64_decode(cell)))
  })
}

# Same scrape as a single pipeline: locate the table by content, convert it,
# clean the column names, then decode every column except the label column.
# Inside across(), `x` refers to the data frame's first column (named "x" by
# clean_names()), not to the parsed page object `x` passed to html_node().
# across() replaces the superseded mutate_at()/vars() pair (dplyr >= 1.0.0).
html_node(x, xpath=".//table[contains(., 'International marine')]") %>%
  html_table() %>%
  janitor::clean_names() %>%
  dplyr::mutate(dplyr::across(-x, decode_cols))