readHTMLTable将数据帧值作为整数返回

时间:2016-03-13 23:02:08

标签: html r rvest html-tableextract

有人可以解释一下,为什么下面标记的那一行返回的是数字,而不是把该行中的字符串设置为列名吗?另外,如果在读取表格时加上被注释掉的 colClasses 那一行,我该如何得到正确的列名?

# Reproduction script from the question: read the HTML tables found at `url`,
# then try to promote row 4 of the result to column names.
url<-'http://qpublic7.qpublic.net/ga_subdivison.php?county=ga_clarke&searchType=nbhd&numberValue=4025R&nameValue=&sectionValue=&townshipValue=&rangeValue=&startDate=01-1998&endDate=&startPrice=&endPrice=&startArea=&endArea=&startAcreage=&endAcreage=&saleQualification=All&saleVacant=All&propertyType=All&reasonType=All&start=0'
library(XML)
# Commented-out readHTMLTable() argument that the question asks about.
#colClasses = c("character","character","character","character","integer","integer","integer","character","character","integer","character","character"),
# NOTE(review): stringsAsFactors is not set here, so text cells presumably come
# back as factors -- confirm with str(data).
data<-readHTMLTable(url,header=F,as.data.frame=T)
View(data)
# NOTE(review): `data` is presumably a list holding one data frame per <table>
# (the answer indexes the same call with [[2]]), so as.data.frame() on the whole
# list is probably not what is wanted -- TODO confirm.
csv<-as.data.frame(data)
# NOTE(review): if the columns are factors, csv[4,] supplies the integer factor
# codes rather than the labels, which would explain the numeric names -- confirm.
colnames(csv)<-csv[4,] #why does this line return numbers?
# Remove every space character from the column names.
colnames(csv)<-gsub(" ","",colnames(csv))
View(head(csv))
# Drop rows 1-4 (row 4 was used as the header source above).
csv<-csv[-c(1:4),]
#####
View(csv)

1 个答案:

答案 0 :(得分:1)

你被 stringsAsFactors 坑了。另外,您对 as.data.frame 的调用并不是正确的做法(请使用 str 而不是 View 来查看数据结构)。

# readHTMLTable() comes from the XML package.
library(XML)

URL <- 'http://qpublic7.qpublic.net/ga_subdivison.php?county=ga_clarke&searchType=nbhd&numberValue=4025R&nameValue=&sectionValue=&townshipValue=&rangeValue=&startDate=01-1998&endDate=&startPrice=&endPrice=&startArea=&endArea=&startAcreage=&endAcreage=&saleQualification=All&saleVacant=All&propertyType=All&reasonType=All&start=0'

# Parse every table on the page and keep the second one. Cells are kept as
# character vectors (stringsAsFactors = FALSE) instead of factors.
csv <- readHTMLTable(
  URL,
  header = FALSE,
  as.data.frame = TRUE,
  stringsAsFactors = FALSE
)[[2]]

# Row 4 carries the header labels: flatten it to a character vector, strip the
# spaces, and install the result as the column names in one step.
colnames(csv) <- gsub(" ", "", as.character(csv[4, ]))

# Rows 1-4 are no longer needed once the header has been promoted.
csv <- csv[-(1:4), ]

dplyr::glimpse(csv)

## Observations: 52
## Variables: 11
## $ \/ParcelNumber\/ (chr) "173C2 F023", "173C2 G009", "173C2 G007", "17...
## $ PropertyType       (chr) "R", "R", "R", "R", "R", "R", "R", "R", "R"...
## $ SaleDate           (chr) "12-2015", "08-2015", "08-2015", "07-2015",...
## $ SalePrice          (chr) "200,000", "265,000", "210,000", "188,000",...
## $ HeatedSqFt         (chr) "1,538", "1,756", "1,415", "1,125", "1,559"...
## $ Acres              (chr) "0.30", "0.37", "0.37", "0.38", "0.32", "0....
## $ SaleQual           (chr) "Q", "Q", "Q", "Q", "Q", "Q", "U", "Q", "Q"...
## $ Reason             (chr) "FM", "FM", "FM", "FM", "FM", "FM", "B", "F...
## $ YearBuilt          (chr) "1952", "1954", "1963", "1963", "1998", "19...
## $ LocationAddress    (chr) "155 HARDIN DR", "140 HARDIN DR", "150 HARD...
## $ Neighborhood       (chr) "4025R-RIVERDALE", "4025R-RIVERDALE", "4025...


# or use the more modern xml2 ---------------------------------------------

library(xml2)
library(rvest)

# Fetch and parse the page once, then convert the second <table> node into a
# data frame. fill=TRUE pads short rows with NA (needed by older rvest releases).
pg <- read_html(URL)

csv2 <- html_table(html_nodes(pg, "table")[[2]], fill=TRUE)

# Columns 12-13 are dropped, as in the original snippet (presumably padding
# produced by fill=TRUE -- confirm with str(csv2)).
csv2 <- csv2[, -c(12:13)]

# Take the header labels from csv2's OWN row 4. The previous version read them
# from `csv`, the data frame built by the XML-based snippet, after that frame's
# header rows had already been removed; this section therefore only worked when
# the other snippet had been run first.
header <- as.character(unlist(csv2[4, ], use.names = FALSE))

# Strip ordinary whitespace and non-breaking spaces (U+00A0). The cell values
# shown in the output below end in a space-like character, presumably &nbsp;
# from the page markup, which a plain " " pattern would not match.
colnames(csv2) <- gsub("[[:space:]\u00a0]+", "", header)

# Rows 1-4 (title rows plus the header row itself) are not data.
csv2 <- csv2[-c(1:4), ]

dplyr::glimpse(csv2)

## Observations: 52
## Variables: 11
## $ \/ParcelNumber\/ (chr) "173C2 F023 ", "173C2 G009 ", "173C2 G007 ", ...
## $ PropertyType       (chr) "R ", "R ", "R ", "R ", "R ", "R ", "R ", "...
## $ SaleDate           (chr) "12-2015 ", "08-2015 ", "08-2015 ", "07-201...
## $ SalePrice          (chr) "200,000 ", "265,000 ", "210,000 ", "188,00...
## $ HeatedSqFt         (chr) "1,538 ", "1,756 ", "1,415 ", "1,125 ", "1,...
## $ Acres              (chr) "0.30 ", "0.37 ", "0.37 ", "0.38 ", "0.32 "...
## $ SaleQual           (chr) "Q ", "Q ", "Q ", "Q ", "Q ", "Q ", "U ", "...
## $ Reason             (chr) "FM ", "FM ", "FM ", "FM ", "FM ", "FM ", "...
## $ YearBuilt          (chr) "1952 ", "1954 ", "1963 ", "1963 ", "1998 "...
## $ LocationAddress    (chr) "155 HARDIN DR ", "140 HARDIN DR ", "150 HA...
## $ Neighborhood       (chr) "4025R-RIVERDALE ", "4025R-RIVERDALE ", "40...