使用此
我有一个文件名列表:filenames <- list.files(getwd(), full.names = FALSE)
> filenames
[1] "2007_acura_mdx" "2007_acura_rdx" "2007_acura_rl"
[4] "2007_acura_tl" "2007_acura_tsx" "2007_audi_a3"
[7] "2007_audi_a4" "2007_audi_a6" "2007_audi_a8"
[10] "2007_audi_q7" "2007_audi_rs4" "2007_audi_s8"
然后,我想对每个文件执行以下操作:
> for (filename in filenames) {
+ sample <- readChar(filename, file.info(filename)$size )
+ }
Error in readChar(filename, file.info(filename)$size) :
invalid UTF-8 input in readChar()
但是,即使我能读取一个文件,也会出现错误。
# Parse one review file with the XML package and collect the children of its
# <DOC> elements into a data frame.
library(XML)
xmlfilename <- "2007_acura_tl"
# Read the whole file as one string; the size reported by file.info() is
# passed as the number of characters to read.
# NOTE(review): the loop shown earlier fails in readChar() with
# "invalid UTF-8 input"; readChar(..., useBytes = TRUE) may be needed -- confirm.
xmlTxt <- readChar(xmlfilename, file.info(xmlfilename)$size)
# Replace every "&" with "and" (presumably because bare ampersands are not
# valid XML -- confirm against the data).
txt <- gsub(pattern ="\\&", replacement ="and", xmlTxt)
# Wrap the text in a single <root> element, then parse the string as XML.
txt2 <- paste("<root>", txt, "</root>")
doc <- xmlTreeParse(txt2, asText = TRUE, useInternalNodes = TRUE)
# For every <DOC> node, apply xmlValue to each of its child nodes.
L <- xpathApply(doc, "//DOC", xmlApply, FUN = xmlValue)
# Convert each per-<DOC> list to a data frame and stack them row-wise.
dd <- do.call(rbind, lapply(L, as.data.frame, stringsAsFactors = FALSE))
# Same extraction for the <DOCNO> nodes.
DOCNO <- xpathApply(doc, "//DOCNO", xmlApply, FUN = xmlValue)
# Put DOCNO in front of the other columns and rename that first column "model".
resultSet <- cbind(DOCNO, dd)
names(resultSet)[1]<-paste("model")
我的最终目标是
请提供一些提示。 在此先感谢:)
答案 0 :(得分:0)
您可以使用 xml2 包中解析方式较为“宽容”的 read_html
来读取这些 XML 数据并生成数据框:
library(dplyr)
library(xml2)

# read_html() uses a lenient HTML parser, so the not-quite-valid XML in the
# review file can still be read.
doc <- read_html("OpinRankDataset/cars/2007/2007_acura_mdx")

# One node per review.
docs <- xml_find_all(doc, "//doc")

# Text of the first child element called `name` inside each <doc>.
# xml_find_first() returns exactly one result per input node (a missing node,
# hence NA text, when the field is absent), so every column has one entry per
# review and the columns cannot drift out of alignment.
field_text <- function(name) {
  xml_text(xml_find_first(docs, paste0("./", name)))
}

# tibble() replaces the deprecated data_frame().
dat <- tibble(
  date = field_text("date"),
  author = field_text("author"),
  text = field_text("text"),
  favorite = field_text("favorite")
)
glimpse(dat)
## Variables: 4
## $ date (chr) "07/31/2009", "07/30/2009", "06/22/2009", "04/13/2009", "04/06/20...
## $ author (chr) "FlewByU", "cvillemdx", "Pleased", "wasatch7", "mnozek", "Debce",...
## $ text (chr) "I just moved to Germany two months ago and bought an 07 MDX from...
## $ favorite (chr) "The separate controls for the rear passengers are awesome. I can...
你也可以这样做:
# Alternative: build one data frame per <doc> node from the text of its
# children, label the columns, then stack all of them with bind_rows().
doc_nodes <- xml_find_all(doc, "//doc")
doc_to_row <- function(node) {
  values <- xml_text(xml_children(node))
  row <- rbind.data.frame(values)
  setNames(row, c("date", "author", "text", "favorite"))
}
bind_rows(lapply(doc_nodes, doc_to_row))
(如果您希望代码少一些“手工劳动”,可以采用上面这种写法。)另外还可以:
library(XML)

# htmlParse() is the XML package's lenient HTML parser; it is used here to
# read the review file.
doc <- htmlParse("OpinRankDataset/cars/2007/2007_acura_mdx")

# Convert every <doc> node into one row of a data frame.
# stringsAsFactors = FALSE keeps the review text as character instead of
# factor, whatever the R / XML package version's default is.
xmlToDataFrame(nodes = getNodeSet(doc, "//doc"), stringsAsFactors = FALSE)
(如果您更习惯使用 XML 包,可以采用上面这种写法。)