我需要从网站下载几个表格,表格的 id 为“tabela”。我尝试了多个函数,例如 XML::readHTMLTable 和 XML::xmlTreeParse,
但只有 rvest 包能够加载该页面:
# Load rvest. Note: require() returns FALSE instead of raising an error when
# the package is missing.
require(rvest)
# Address of the report page that is expected to hold the table with id "tabela".
url="http://www.pse.pl/index.php?modul=21&id_rap=2&data=2013-01-01"
# Download and parse the page into a document object.
wpkd <- html(url)
# Inspect the class vector of the parsed document.
class(wpkd)
[1] "HTMLInternalDocument" "HTMLInternalDocument" "XMLInternalDocument" "XMLAbstractDocument"
str(wpkd)
Classes 'HTMLInternalDocument', 'HTMLInternalDocument', 'XMLInternalDocument', 'XMLAbstractDocument' <externalptr>
现在我想提取 id 为“tabela”的表格,或者将 wpkd 保存为纯文本后再尝试底层提取。
但 wpkd 的结构无法被正确识别:
> wpkd %>% xml_structure()
{DTD}
<html>
<head>
<title> {text}
<meta [http-equiv, content]>
<meta [name, content]>
<meta [name, content]>
<link [rel, type, title, href]>
<meta [name, content]>
<meta [name, content]>
<meta [http-equiv, content]>
<meta [http-equiv, content]>
<link [rel, type, href]>
<link [rel, href, type]>
<link [rel, type, href]>
<link [rel, href, type, media]>
<link [rel, href, type, media]>
<link [rel, href, type, media]>
<script [src]>
<script [src]>
Error: Unknown input XMLInternalCommentNode/XMLInternalNode/XMLAbstractNode
答案 0 :(得分:1)
鉴于“表头”并不统一(跨越多个 TR),下面是一种方法(并非唯一的方法):
library(rvest)
library(magrittr)
library(dplyr)
# Download and parse the report page. read_html() is used because rvest's
# html() was deprecated in rvest 0.3.0 and is no longer available in current
# releases.
pg <- read_html("http://www.pse.pl/index.php?modul=21&id_rap=2&data=2013-01-01")
# Extract the text of one column of the table whose id is "tabela".
#
# doc: parsed HTML document to search.
# i:   1-based position of the <td> within each <tr> (used in the XPath
#      predicate `td[i]`); must be a single whole number.
#
# Returns a character vector with the text of every matched cell, minus the
# leading header matches that are dropped below.
get_col <- function(doc, i) {
  # The header spans several rows, so the first matches are not data: drop the
  # first two, except for column 8 (the "wonky" last column) where only the
  # first match is dropped. `i` is a scalar, so plain if/else is used rather
  # than the vectorised ifelse().
  skip <- if (i == 8) -1 else -2
  doc %>%
    html_nodes(xpath = sprintf("//table[@id='tabela']/tr/td[%d]", i)) %>%
    extract(-1:skip) %>% # negative indices remove the header matches
    html_text()
}
# Assemble the data frame column by column; naming each column by hand gives
# clearer names than the table's own multi-row header would.
energy <- data.frame(
  time = get_col(pg, 1),
  demand = get_col(pg, 2),
  capacity_jwcd = get_col(pg, 3),
  capacity_njwcd = get_col(pg, 4),
  generation_jwcd = get_col(pg, 5),
  generation_njwcd = get_col(pg, 6),
  reserve_over = get_col(pg, 7),
  reserve_below = get_col(pg, 8),
  stringsAsFactors = FALSE
)

# Print a compact overview of the resulting columns.
glimpse(energy)
## Observations: 24
## Variables:
## $ time (chr) "1", "2", "3", "4", "5", "6", "7", "8", "9", "10...
## $ demand (chr) "14 650", "14 000", "13 325", "12 850", "12 575"...
## $ capacity_jwcd (chr) "21 032", "21 032", "21 032", "21 032", "21 032"...
## $ capacity_njwcd (chr) "8 918", "8 918", "8 918", "8 918", "8 918", "8 ...
## $ generation_jwcd (chr) "7 085", "6 446", "5 777", "5 307", "5 031", "4 ...
## $ generation_njwcd (chr) "7 565", "7 554", "7 548", "7 543", "7 544", "7 ...
## $ reserve_over (chr) "1 328", "1 269", "1 209", "1 166", "1 141", "1 ...
## $ reserve_below (chr) "-1 328", "-1 269", "-1 209", "-1 166", "-1 141"...
您需要自行进行类型转换(即使使用了某个自动解析表格的函数,也仍然需要这样做)。