我正在下载并解压一个包含 XML 的 gz 文件。当我用 readLines 读取该文件并把结果传给 xmlParse 时,只得到了 XML 声明这一行(文件头),但文件中实际上包含完整的 XML 结构。
解压后的 XML 文件可以在这里下载。
# Locate the "Store" price file on the last page of the Shufersal listing,
# download it to a temporary file and read it through a gzfile() connection.
library(rvest) # read_html(), html_nodes(), html_attr() and the %>% pipe
library(XML) # xmlParse(), which is applied to the download further down

url.base <- "http://prices.shufersal.co.il/FileObject/UpdateCategory?catID=0&storeId=0&page="

# The sixth link in the grid footer is taken to be the "last page" pager link
# (assumption carried over from the XPath below -- confirm against the page).
last.page.href <- read_html(paste0(url.base, 1)) %>%
  html_nodes(xpath = "//div[@id='gridContainer']/table/tfoot/tr/td/a[6]") %>%
  html_attr("href")

# Take the digits that follow "page=" rather than deleting every character
# outside 1-9: that approach also deleted zeros, so page 10 was read as 1.
# NOTE(review): assumes the pager href carries a "page=" query parameter,
# as url.base does -- confirm.
max.page <- as.numeric(
  sub(".*page=([0-9]+).*", "\\1", last.page.href, ignore.case = TRUE)
)
stopifnot(length(max.page) == 1L, !is.na(max.page))

# Collect every link on the last listing page and keep the "Store" one(s).
shufersal.url <- read_html(paste0(url.base, max.page)) %>%
  html_nodes("a") %>%
  html_attr("href")
shufersal.url <- shufersal.url[grepl("Store", shufersal.url)]

# A single destination file is supplied below, so exactly one URL is required.
stopifnot(length(shufersal.url) == 1L)

temp <- tempfile()
# mode = "wb" transfers the file in binary mode so the archive is not altered.
download.file(shufersal.url, temp, quiet = TRUE, mode = "wb")
readLines(gzfile(temp), encoding = "UTF-8")
[1] "<?xml version=\"1.0\" encoding=\"UTF-8\"?>"
# Same gzfile() read as above, now handed to xmlParse(). Only the XML
# declaration line comes back, so the parser finds no start tag (error below).
gz.lines <- readLines(gzfile(temp), encoding = "UTF-8")
xmlParse(gz.lines)
Start tag expected, '<' not found
Error: 1: Start tag expected, '<' not found
# Remove the temporary file that held the download.
unlink(temp)
答案 0 :(得分:0)
# Read the downloaded file by path instead of wrapping it in gzfile(); this
# appears to return the full contents, though the reason is not established.
# (Possibly relevant: readLines() on a path opens it via file(), which can
# auto-detect compression -- unconfirmed as the explanation here.)
kk <- readLines(temp, encoding = "UTF-8")
# Parse the lines already in memory rather than reading the file a second time.
jj <- xmlParse(kk)