使用rowspan值提取html表

时间:2015-12-06 19:52:48

标签: xml r html-table rcurl

我使用以下代码创建的数据框(使用 RCurl XML 包)将三个字母的团队缩写放入其跨越的第一行。我可以添加另一个包或附加代码以将数据保存在正确的列中吗?

library(XML)
library(RCurl)
url <- "https://en.wikipedia.org/wiki/List_of_Major_League_Baseball_postseason_teams"
url_source <- readLines(url, encoding = "UTF-8")
playoffs <- data.frame(readHTMLTable(url_source, stringsAsFactors = F, header = T) [2])

4 个答案:

答案 0 :(得分:1)

这是一个答案。我冒充了数据

library(dplyr)
library(XML)
library(RCurl)
library(stringi)
library(zoo)
library(tidyr)

initial_data =
  "https://en.wikipedia.org/wiki/List_of_Major_League_Baseball_postseason_teams" %>%
  readLines(encoding = "UTF-8") %>%
  readHTMLTable(stringsAsFactors = F) %>%
  `[[`(2) %>%
  mutate(ID = 1:n(),
         test =
           V1 %>%
           stri_detect_regex("^[A-Z]{2,3}$"))

variable_names = c("franchise",
                   "years",
                   "initial_postseason_appearances")

shifts = 
  initial_data %>%
  filter(test %>% `!`) %>%
  setNames(c(variable_names,
             "trash",
             "ID",
             "test"))

team_initial =
  initial_data %>%
  filter(test) %>%
  setNames(c("initial_abbreviation",
             variable_names,
             "ID",
             "test")) %>%
  bind_rows(shifts) %>%
  arrange(ID) %>%
  separate(years, c("start", "end")) %>%
  mutate(abbreviation = initial_abbreviation %>% na.locf,
         split_postseason_appearances =
           initial_postseason_appearances %>%
           plyr::mapvalues("–", NA) %>%
           stri_split_fixed(", ") )

appearance = 
  team_initial %>%
  select(franchise,
         split_postseason_appearances) %>%
  unnest(split_postseason_appearances) %>%
  mutate(postseason_appearance =
           split_postseason_appearances %>%
           extract_numeric) %>%
  select(-split_postseason_appearances)

team = 
  team_initial %>%
  select(abbreviation,
         franchise,
         start,
         end)

答案 1 :(得分:1)

考虑一个XML包解决方案,需要使用带有for循环和if/then逻辑的xpathSApply()的各种XPath表达式。要捕获行跨越表记录,请使用各种XPath字符串函数:string-length(), concat(), and substring()

library(XML)

# PARSE FROM URL
url <- "https://en.wikipedia.org/wiki/List_of_Major_League_Baseball_postseason_teams"
webpage <- readLines(url)
html = htmlTreeParse(webpage, useInternalNodes = TRUE, asText = TRUE)

# INITIALIZE LISTS
code <- c()
team <- c()
year <- c()
postseason <- c()

# APPEND TO LISTS LOOPING ACROSS ALL TEAMS
numberofteams <- length(xpathSApply(html, "//table[2]//tr/td[1]"))

for (i in (1:numberofteams+1)) {
  # TR NODES WITH LETTER TEAM ABBREVIATION (STRING LENGTH=2 or 3)
  if (as.character(xpathSApply(html, sprintf("string-length(//table[2]/tr[%s]/td[1])", i), xmlValue)) %in% c("2","3")) {

    code <- c(code, xpathSApply(html, sprintf("//table[2]/tr[%s]/td[1]", i), xmlValue))
    team <- c(team, xpathSApply(html, sprintf("//table[2]/tr[%s]/td[2]", i), xmlValue))
    year <- c(year, xpathSApply(html, sprintf("//table[2]/tr[%s]/td[3]", i), xmlValue))
    postseason <- c(postseason, xpathSApply(html, sprintf("//table[2]/tr[%s]/td[4]", i), xmlValue))
  } else {
    # TR NODES W/O LETTER TEAM ABBREVIATION       
    code <- c(code, xpathSApply(html, sprintf("substring(concat(//table[2]/tr[position()=%s-1]/td[position()=1 and string-length(.)=3],
                                               //table[2]/tr[position()=%s-2]/td[position()=1 and string-length(.)=3]), 1, 3)", i, i), xmlValue))
    team <- c(team, xpathSApply(html, sprintf("//table[2]/tr[%s]/td[1]", i), xmlValue))
    year <- c(year, xpathSApply(html, sprintf("//table[2]/tr[%s]/td[2]", i), xmlValue))
    postseason <- c(postseason, xpathSApply(html, sprintf("//table[2]/tr[%s]/td[3]", i), xmlValue))        

  }
}

# COMBINE LISTS INTO DATA FRAME
playoffs <- data.frame(code = unlist(code), 
                       team = unlist(team), 
                       year = unlist(year), 
                       postseason = unlist(postseason))

答案 2 :(得分:1)

你实际上非常接近。您需要做的唯一事情是将数据放在正确的列和行中,因为某些行已向左移动。您可以按照以下方式实现此目的(在 data.table zoo 包的帮助下):

# your original code
url <- "https://en.wikipedia.org/wiki/List_of_Major_League_Baseball_postseason_teams"
url_source <- readLines(url, encoding = "UTF-8")
playoffs <- data.frame(readHTMLTable(url_source, stringsAsFactors = F, header = T)[2])

# assigning proper names to the columns
names(playoffs) <- c("shortcode","franchise","years","appearances")

# 1. shift the dat columnwise for the rows in which there is no shortcode
# 2. fill the resulting NA's with the last observation
# 3. only keep the last shortcode when the previous ones are the same
#    because only there the shortcode matches the franchise name
library(data.table)
library(zoo)
setDT(playoffs)[nchar(shortcode) > 3, `:=` (shortcode = NA,
                                            franchise = shortcode,
                                            years = franchise,
                                            appearances = years)
                ][, shortcode := na.locf(shortcode)
                  ][shortcode == shift(shortcode, 1L, type="lead"), shortcode := NA]

答案 3 :(得分:1)

尝试使用htmltab:

install.packages("htmltab")
library(htmltab)

purl <- htmlParse(url_source)
htmltab(purl, which = 2)