我想从 538(FiveThirtyEight)网站提取数据,但我想要的是点击“显示更多民意调查”之后才会出现的完整数据……有没有办法让该函数访问表格中其余的行?
http://projects.fivethirtyeight.com/2016-election-forecast/national-polls/
提取页面默认显示的顶部那部分数据的代码如下:
require(XML)
# Download the national-polls page and parse it into an internal-node HTML
# document. Only the rows present in the served HTML are available here; the
# rows revealed by the page's "show more polls" control are not.
polls.html <- htmlTreeParse("http://projects.fivethirtyeight.com/2016-election-forecast/national-polls/",
useInternalNodes = TRUE)
# Read the HTML tables found in the document, keeping text columns as
# character rather than factor.
parsedDoc <- readHTMLTable(polls.html, stringsAsFactors=FALSE)
# Keep the fourth table as a data frame.
# NOTE(review): the index 4 is hard-coded and presumably matches the polls
# table in the page layout at the time of writing -- confirm before reuse.
pollData <- data.frame(parsedDoc[4])
答案 0 :(得分:0)
我有点困惑为什么这个问题会被投反对票……在我看来,答案并不是显而易见的!不过,对于任何想了解解决方案的人:我确实已经弄明白了(感谢 @duncantl 的一些帮助)。(此外,完整的分析位于:https://github.com/hardin47/prediction2016)
require(XML)
require(dplyr)
require(tidyr)
require(readr)
require(mosaic)
require(RCurl)
require(ggplot2)
require(lubridate)
require(RJSONIO)
# Fetch the national-polls page and recover the JSON object that the page's
# inline JavaScript assigns to `race.stateData`. The full poll list lives in
# that object's `polls` element, so no interaction with the page is needed.
url <- "http://projects.fivethirtyeight.com/2016-election-forecast/national-polls/"
doc <- htmlParse(url, useInternalNodes = TRUE)

# Text of every <script> element whose body mentions 'race.model'. Only the
# node text is collected: asking for an "href" attribute (as an earlier
# version did) adds nothing for <script> tags and raises "subscript out of
# bounds" whenever the tag carries other attributes.
sc <- xpathSApply(doc, "//script[contains(., 'race.model')]", xmlValue)

# Keep only scripts that actually contain the assignment we want, so that a
# changed page fails here with a clear message instead of inside fromJSON().
state_data_pattern <- ".*race.stateData = (.*);race.pathPrefix.*"
sc <- sc[grepl(state_data_pattern, sc)]
if (length(sc) == 0L) {
  stop("Could not find the 'race.stateData' assignment in any <script> of ",
       url, call. = FALSE)
}

# Strip everything around the JSON literal; use the first matching script.
jsobj <- gsub(state_data_pattern, "\\1", sc[[1L]])
data <- fromJSON(jsobj)
allpolls <- data$polls
# Flatten the list of polls into a data frame with one row per poll.
# Every poll record is padded (at the end) to the length of the longest
# record so that rbind() can stack them into a rectangular structure.
# NOTE(review): padding is positional, so this presumably relies on all polls
# listing their fields in the same order -- a record missing a field in the
# middle would end up with shifted columns. Confirm against the data.
indx <- lengths(allpolls)  # type-stable replacement for sapply(x, length)
pollsdf <- as.data.frame(
  do.call(rbind, lapply(allpolls, `length<-`, max(indx)))
)
# Expand the `weight` entry of every poll into separate columns.
# Each weight is wrapped in its own data frame, the pieces are bound side by
# side, and the result is transposed so that polls run down the rows.
# NOTE(review): assumes every weight holds exactly three values, presumably in
# the order polls / plus / now to match the names assigned below -- TODO confirm.
pollswt <- as.data.frame(t(as.data.frame(do.call(cbind, lapply(pollsdf$weight, data.frame,
stringsAsFactors=FALSE)))))
names(pollswt) <- c("wtpolls", "wtplus", "wtnow")
# Drop the row names produced by the transpose so they reset to 1..n.
row.names(pollswt) <- NULL
# Append the weight columns to the per-poll data frame (rows matched by position).
pollsdf <- cbind(pollsdf, pollswt)
# Unpack the `votingAnswers` entry of every poll.
# As with the polls themselves, each entry is padded to the longest length so
# the answers can be stacked; the first two answer slots end up in the
# list-columns V1 and V2.
indxv <- lengths(pollsdf$votingAnswers)
pollsvot <- as.data.frame(
  do.call(rbind, lapply(pollsdf$votingAnswers, `length<-`, max(indxv)))
)

# Turn one answer slot (a list with one element per poll) into a single data
# frame by converting every element to a data frame and stacking them.
stack_answers <- function(answers) {
  as.data.frame(do.call(rbind, lapply(answers, data.frame,
                                      stringsAsFactors = FALSE)))
}

# Add three bookkeeping columns derived from the row names of a stacked
# answer table:
#   polltype  - the raw row name
#   polltypeA - the row name with all digits removed
#   polltype1 - the number embedded in the row name plus one, or 1 when the
#               row name contains no number
# NOTE(review): presumably the embedded number is the counter rbind() appends
# to repeated row names, which would make polltype1 a 1-based poll index --
# confirm against the data.
label_answers <- function(answers_df) {
  rn <- rownames(answers_df)
  # Same computation as the deprecated tidyr::extract_numeric(), without the
  # deprecation message: keep digits, '.' and '-' and convert (""-> NA).
  rn_number <- as.numeric(gsub("[^0-9.-]+", "", rn))
  answers_df <- cbind(polltype = rn, answers_df,
                      polltypeA = gsub('[0-9]+', '', rn),
                      polltype1 = rn_number)
  answers_df$polltype1 <- ifelse(is.na(answers_df$polltype1), 1,
                                 answers_df$polltype1 + 1)
  answers_df
}

pollsvot1 <- label_answers(stack_answers(pollsvot$V1))
pollsvot2 <- label_answers(stack_answers(pollsvot$V2))
# Convert the list-columns needed downstream into plain vectors, then keep
# only those columns plus the three weight columns.
pollsdf <- pollsdf %>%
  mutate(
    # Text fields: flatten as-is.
    population = unlist(population),
    pollster = unlist(pollster),
    pollsterRating = unlist(pollsterRating),
    # Numeric field.
    sampleSize = as.numeric(unlist(sampleSize)),
    # Date fields, parsed as year-month-day.
    startDate = ymd(unlist(startDate)),
    endDate = ymd(unlist(endDate))
  ) %>%
  select(
    population, sampleSize, pollster,
    startDate, endDate, pollsterRating,
    wtpolls, wtplus, wtnow
  )
# Build the final table: poll-level columns on the left, answer-level columns
# on the right, sorted by polltype1 and then by choice.
# Each poll row is repeated three times and the repeated block is stacked
# twice, so the left-hand side lines up row-for-row with pollsvot1 followed by
# pollsvot2.
# NOTE(review): the factor 3 presumably equals the number of rows each poll
# contributes to pollsvot1 / pollsvot2 -- confirm against the data.
allpolldata <- cbind(
  rbind(
    pollsdf[rep(seq_len(nrow(pollsdf)), each = 3), ],
    pollsdf[rep(seq_len(nrow(pollsdf)), each = 3), ]
  ),
  rbind(pollsvot1, pollsvot2)
) %>%
  arrange(polltype1, choice)