我很难从[iea.org] [1]抓取数据表。我使用以下代码:
library("rvest")
url <- "http://www.iea.org/statistics/statisticssearch/report/?country=ZAMBIA&product=balances&year=2013"
energy <- url %>%
html() %>%
html_nodes(xpath='//*[@id="stats-container"]/div[2]/table') %>%
html_table()
head(energy)
R中的结果表只包含字母,而不是在表的单元格中包含数字。 感谢您的帮助。
答案 0 :(得分:1)
除非另有证明(或网站所有者阅读了如何使用robots.txt
并找到真正的律师来制定更明确和限制性的T&amp; Cs)......
我将从这个答案的非“tidyverse”解决方案开始:
library(rvest)
x <- read_html("http://www.iea.org/statistics/statisticssearch/report/?country=ZAMBIA&product=balances&year=2013")
# find the table; note that a less "structural" selector will generally make
# scraping code a bit less fragile.
xdf <- html_node(x, xpath=".//table[contains(., 'International marine')]")
xdf <- html_table(xdf)
# clean up column names
xdf <- janitor::clean_names(xdf)
现在,这些列按照OP和问题评论讨论所述进行编码:
xdf$oil_products
## [1] "MA==" "Mzkx" "LTUw" "MA==" "LTUy" "MA==" "Mjkw" "MA==" "MQ==" "LTEw"
## [11] "MA==" "MA==" "MA==" "NjAx" "MA==" "MA==" "MA==" "LTE1" "MA==" "ODY2"
## [21] "MzQ2" "MzMy" "MTI0" "Nw==" "NDI=" "MjY=" "MA==" "NTA=" "NjM=" "MA=="
==
将其作为base64编码(虽然评论中提到的URL进一步证实了这一点)。他们编码每个字符,所以我们需要先从b64转换它们然后转换为数字:
# decode each column
lapply(xdf[2:12], function(.x) {
as.numeric(
sapply(.x, function(.y) {
rawToChar(openssl::base64_decode(.y))
}, USE.NAMES=FALSE)
)
}) -> xdf[2:12]
快速str()
替代视图:
tibble::glimpse(xdf)
## Observations: 30
## Variables: 12
## $ x <chr> "Production", "Imports", "Exports", "International marine bunkers***", "International aviation bunkers***", "Stock c...
## $ coal <dbl> 88, 0, 0, 0, 0, 0, 88, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 88, 88, 0, 0, 0, 0, 0, 0, 0, 0, 0
## $ crude_oil <dbl> 0, 618, 0, 0, 0, 21, 639, 0, 0, 0, 0, 0, 0, -639, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
## $ oil_products <dbl> 0, 391, -50, 0, -52, 0, 290, 0, 1, -10, 0, 0, 0, 601, 0, 0, 0, -15, 0, 866, 346, 332, 124, 7, 42, 26, 0, 50, 63, 0
## $ natural_gas <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
## $ nuclear <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
## $ hydro <dbl> 1142, 0, 0, 0, 0, 0, 1142, 0, 0, -1142, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
## $ geothermal_solar_etc <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
## $ biofuels_and_waste <dbl> 7579, 0, 0, 0, 0, 0, 7579, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1661, 0, 0, 5918, 1479, 0, 4438, 4438, 0, 0, 0, 0, 0, 0
## $ electricity <dbl> 0, 6, -93, 0, 0, 0, -87, 0, 0, 1144, 0, 0, 0, 0, 0, 0, 0, -26, -98, 933, 549, 2, 382, 289, 59, 23, 0, 10, 0, 0
## $ heat <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
## $ total <dbl> 8809, 1016, -143, 0, -52, 21, 9651, 0, 1, -9, 0, 0, 0, -39, 0, 0, -1661, -41, -98, 7805, 2462, 335, 4945, 4734, 101,...
增强版画:
tibble::as_tibble(xdf)
## # A tibble: 30 x 12
## x coal crude_oil oil_products natural_gas nuclear hydro geothermal_solar_etc biofuels_and_waste electricity heat
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Production 88 0 0 0 0 1142 0 7579 0 0
## 2 Imports 0 618 391 0 0 0 0 0 6 0
## 3 Exports 0 0 -50 0 0 0 0 0 -93 0
## 4 International marine bunkers*** 0 0 0 0 0 0 0 0 0 0
## 5 International aviation bunkers*** 0 0 -52 0 0 0 0 0 0 0
## 6 Stock changes 0 21 0 0 0 0 0 0 0 0
## 7 TPES 88 639 290 0 0 1142 0 7579 -87 0
## 8 Transfers 0 0 0 0 0 0 0 0 0 0
## 9 Statistical differences 0 0 1 0 0 0 0 0 0 0
## 10 Electricity plants 0 0 -10 0 0 -1142 0 0 1144 0
## # ... with 20 more rows, and 1 more variables: total <dbl>
tidyverse有点清洁:
decode_cols <- function(.x) {
map_dbl(.x, ~{
openssl::base64_decode(.x) %>%
rawToChar() %>%
as.numeric()
})
}
html_node(x, xpath=".//table[contains(., 'International marine')]") %>%
html_table() %>%
janitor::clean_names() %>%
mutate_at(vars(-x), decode_cols)