我用 R 抓取了一张维基百科上的表格:
# Load rvest, which supplies read_html(), html_node() and html_table().
library(rvest)

# Address of the Wikipedia article to scrape.
url <- "https://en.wikipedia.org/wiki/New_York_City"

# Download the page, take the first node matched by the XPath expression
# and convert that table to a data frame.
nyc <- html_table(
  html_node(
    read_html(url),
    xpath = '//*[@id="mw-content-text"]/div/table[1]'
  ),
  fill = TRUE
)
并希望将值保存到新的数据框中。
输出
Area population
468.484 sq mi 8,336,817
做到这一点的最佳方法是什么?
答案 0 :(得分:2)
您需要先确定要使用哪一张表,然后从该表中选取所需的行和列。使用 setNames 设置列名,并将 rownames 设置为 NULL 以重置行名。我猜您希望 population 列是整数类型(as.integer),只需在转换之前先用 gsub 去掉其中的非数字字符即可。
我不确定 html_node 那一行是否必要,因此在下面的代码中将其注释掉了。
# Load rvest (also provides the %>% pipe used below).
library(rvest)

url <- "https://en.wikipedia.org/wiki/New_York_City"

# Parse the page and convert every table on it; the result is a list with
# one element per table. The node-selection step is intentionally left out.
nyc <- url %>%
  read_html() %>%
  # html_node(xpath = '//*[@id="mw-content-text"]/div/table[1]') %>%
  html_table(header = TRUE, fill = TRUE)

# Keep the third table only, dropping rows 1, 2 and 10 and retaining
# columns 2 and 3.
nyc <- nyc[[3]][-c(1:2, 10), 2:3]

# Give the two columns readable names and discard the leftover row names
# so that rows are numbered from 1 again.
names(nyc) <- c("area", "population")
rownames(nyc) <- NULL

# Strip every non-digit character (e.g. thousands separators) before
# converting the population column to integer.
nyc$population <- as.integer(gsub("\\D", "", nyc$population))

nyc
# area population
# 1 Bronx 1418207
# 2 Kings 2559903
# 3 New York 1628706
# 4 Queens 2253858
# 5 Richmond 476143
# 6 City of New York 8336817
# 7 State of New York 19453561
答案 1 :(得分:0)
从 OP 的示例输出来看,他们想要的表格所在的 xpath 与问题中给出的 xpath 不同。请参阅以下工作流程。注意:手动设置列名可以省去对表头行中的字符串进行格式化的麻烦:
# Attach rvest for the scraping helpers and the %>% pipe.
library(rvest)

# Address of the Wikipedia article to scrape.
url <- "https://en.wikipedia.org/wiki/New_York_City"

# Fetch the page, select the table at the given absolute XPath, convert it
# to a data.frame and assign the nine column names by hand in a single pipe.
nyc <- url %>%
  read_html() %>%
  html_node(xpath = "/html/body/div[3]/div[3]/div[4]/div/table[3]") %>%
  html_table(fill = TRUE) %>%
  data.frame() %>%
  setNames(c(
    "borough", "county", "pop_est_2019",
    "gdp_bill_usd", "gdp_per_cap",
    "land_area_sq_mi", "land_area_sq_km",
    "density_pop_sq_mi", "density_pop_sq_km"
  ))
# Coerce every column of the scraped table to a suitable type:
# cleaned => data.frame
#
# Rows used: 3 through nrow(nyc) - 1. This was previously written as
# `4:nrow(nyc)-1`, which selects exactly the same rows because `:` binds
# tighter than `-` (it parses as (4:nrow(nyc)) - 1). The range is spelled
# out here so the intent is visible.
# NOTE(review): presumably this skips leading header rows and one trailing
# footer row of the scraped table -- confirm against the live page.
cleaned <- data.frame(
  lapply(nyc[3:(nrow(nyc) - 1), ], function(x) {
    # A column is treated as numeric when at least one value ends in a
    # comma-separated digit group ("1,234") or is entirely a decimal
    # number ("12.5").
    if (any(grepl("\\d+\\,\\d+$|^\\d+\\.\\d+$", x))) {
      # Remove the thousands separators and surrounding whitespace, then
      # convert; values that still are not numbers become NA with a warning.
      as.numeric(trimws(gsub("\\,", "", as.character(x)), "both"))
    } else {
      # Everything else is kept as a categorical column.
      as.factor(x)
    }
  })
)