使用 R 抓取需要借助 JavaScript 才能完整显示的 HTML 数据

时间:2016-08-17 16:05:20

标签: javascript html r

我想从 538(FiveThirtyEight)网站提取数据,但我想要的是点击“显示更多民意调查”之后才能看到的完整数据……该函数是否有办法访问表格的其他行?

http://projects.fivethirtyeight.com/2016-election-forecast/national-polls/

提取顶级数据的代码是:

# Load XML with library() rather than require(): if the package is missing,
# library() stops here with a clear error instead of returning FALSE and
# letting the next call fail with "could not find function".
library(XML)

# Download the national-polls page and parse it into an internal-node document.
polls.html <- htmlTreeParse("http://projects.fivethirtyeight.com/2016-election-forecast/national-polls/",
useInternalNodes = TRUE)

# Convert the <table> elements of the parsed page into a list of data frames,
# keeping text columns as character vectors.
parsedDoc <- readHTMLTable(polls.html, stringsAsFactors=FALSE)

# Take the fourth table. Single-bracket indexing keeps a one-element list,
# which data.frame() expands into columns.
# NOTE(review): presumably this is the polls table with only the initially
# visible rows -- confirm against the live page.
pollData <- data.frame(parsedDoc[4])

1 个答案:

答案 0 :(得分:0)

我有点困惑为什么这个问题会被踩(downvote)……对我来说答案并不明显!不过,对于任何想要了解解决方案的人:我确实已经弄明白了(感谢 @duncantl 的一些帮助)。(此外,完整的分析位于:https://github.com/hardin47/prediction2016 )

require(XML)
require(dplyr)
require(tidyr)
require(readr)
require(mosaic)
require(RCurl)
require(ggplot2)
require(lubridate)
require(RJSONIO)

# Fetch the national-polls page and extract the poll data embedded in it ----

url <- "http://projects.fivethirtyeight.com/2016-election-forecast/national-polls/"
doc <- htmlParse(url, useInternalNodes = TRUE)

# Collect the text of every <script> element whose content mentions
# 'race.model'. Only the script text is needed: the former lookup of an
# "href" attribute added nothing for attribute-less nodes and raised
# "subscript out of bounds" for nodes that carry other attributes.
sc <- xpathSApply(doc, "//script[contains(., 'race.model')]", xmlValue)
if (length(sc) == 0L) {
  stop("No <script> containing 'race.model' was found at ", url,
       call. = FALSE)
}

# Keep only the text assigned to race.stateData, i.e. everything between
# "race.stateData = " and ";race.pathPrefix".
jsobj <- gsub(".*race.stateData = (.*);race.pathPrefix.*", "\\1", sc)

# Parse that text as JSON and pull out its `polls` element.
data <- fromJSON(jsobj)
allpolls <- data$polls

# Flatten the list of polls into a data frame ----
# Every poll record is padded to the length of the longest one so that the
# records can be stacked row-wise with rbind().
indx <- sapply(allpolls, length)
pollsdf <- as.data.frame(
  do.call(
    rbind,
    lapply(allpolls, function(poll) {
      length(poll) <- max(indx)
      poll
    })
  )
)

# Flatten the weights ----
# Turn each poll's `weight` entry into a data frame, bind them side by side,
# then transpose so that each poll becomes one row.
pollswt <- lapply(pollsdf$weight, data.frame, stringsAsFactors = FALSE)
pollswt <- as.data.frame(do.call(cbind, pollswt))
pollswt <- as.data.frame(t(pollswt))
colnames(pollswt) <- c("wtpolls", "wtplus", "wtnow")
rownames(pollswt) <- NULL

# Attach the three weight columns to the poll table
pollsdf <- cbind(pollsdf, pollswt)

# Flatten the voting answers ----

# Pad each poll's votingAnswers to a common length so they stack row-wise;
# the stacked positions are read below as columns V1 and V2.
indxv <- sapply(pollsdf$votingAnswers, length)
pollsvot <- as.data.frame(do.call(rbind, lapply(pollsdf$votingAnswers,
                                                `length<-`, max(indxv))))

# Convert every element of one answer column to a data frame and stack them.
stack_answers <- function(answers) {
  as.data.frame(do.call(rbind, lapply(answers, data.frame,
                                      stringsAsFactors = FALSE)))
}

# Add three identifier columns derived from the row names of `votes`:
#   polltype  - the row name itself
#   polltypeA - the row name with all digits removed
#   polltype1 - the number embedded in the row name plus one, or 1 when the
#               row name contains no number
# NOTE(review): presumably the numbers are the suffixes rbind() appends to
# repeated row names, making polltype1 a running poll index -- confirm.
label_answers <- function(votes) {
  rn <- rownames(votes)
  # Base-R equivalent of the deprecated tidyr::extract_numeric(): strip
  # everything that is not a digit, dot or minus sign, then convert.
  # An empty remainder becomes NA.
  suffix <- as.numeric(gsub("[^0-9.-]+", "", rn))
  votes <- cbind(polltype = rn, votes,
                 polltypeA = gsub('[0-9]+', '', rn),
                 polltype1 = suffix)
  votes$polltype1 <- ifelse(is.na(votes$polltype1), 1, votes$polltype1 + 1)
  votes
}

pollsvot1 <- label_answers(stack_answers(pollsvot$V1))
pollsvot2 <- label_answers(stack_answers(pollsvot$V2))


# Flatten the list columns that are kept into plain vectors, converting the
# sample size to numeric and the two dates with ymd().
# NOTE(review): unlist() drops NULL entries, so this presumably relies on
# every poll carrying all of these fields -- confirm.
pollsdf <- mutate(
  pollsdf,
  population = unlist(population),
  sampleSize = as.numeric(unlist(sampleSize)),
  pollster = unlist(pollster),
  startDate = ymd(unlist(startDate)),
  endDate = ymd(unlist(endDate)),
  pollsterRating = unlist(pollsterRating)
)

# Keep only the poll descriptors and the three weight columns
pollsdf <- select(
  pollsdf,
  population, sampleSize, pollster, startDate, endDate, pollsterRating,
  wtpolls, wtplus, wtnow
)



# Repeat every poll row three times, in order.
# NOTE(review): presumably because each answer table holds three rows per
# poll -- confirm.
tripled <- pollsdf[rep(seq_len(nrow(pollsdf)), each = 3), ]

# Stack the repeated poll rows twice (once per answer table) and bind them
# column-wise to the two stacked answer tables.
allpolldata <- cbind(rbind(tripled, tripled),
                     rbind(pollsvot1, pollsvot2))

# Sort by poll index, then by choice
allpolldata <- arrange(allpolldata, polltype1, choice)