Question

我在 R 中尝试读取从 URL 获取的一些 JSON 数据时遇到了很多麻烦。我能够读取数据，也能逐个访问每条观测来取得它的值（字符型），这没有问题，但我始终找不到把数据整理成表格形式的方法（基本上就像在 Excel 中那样）。

我尝试写一个循环，逐个读取每个字段并把它放进一个空矩阵，但并不是每个对象的字段数量都相同（即有些对象同时具有 Label1 和 Label2，而另一些只有 Label1），于是我得到了 "subscript out of bounds"（下标越界）错误。我的想法是写一个条件语句：如果该字段存在，就把它的值放入数据矩阵；如果该字段不存在，就插入一个 NA。但下标错误会在条件判断执行之前就直接抛出，条件根本没有机会被求值。我查过是否可以把这个错误强制转换成 NA，但我认为这是做不到的。

我从 j = 3 开始索引，因为我不需要 JSON 数据中的前两条观测。我的问题是，例如 "json$poi[[j]]$label[[2]]$value" 并不是每条观测都存在，一旦遇到第一条缺少该字段的观测，代码就会立即报错。

数据相当大——大约 4480 条观测，每条最多 20 个字段。我只需要我列出的 9 个字段。下面是数据 URL 的链接——加载可能需要一些时间。我刚接触编程，尤其是处理 JSON 文件，所以如果这个问题有一个我没有看到的简单解决方案，还请见谅。

谢谢！

http://tourism.citysdk.cm-lisboa.pt/pois/?limit=-1

library(rjson)
library(RCurl)

# Download the response body from the URL as text and parse it into a nested
# R list.
json <- fromJSON(getURL('http://tourism.citysdk.cm-lisboa.pt/pois/?limit=-1'))

# Number of rows to fill: every entry of json$poi except the first two, which
# are skipped below via j <- i + 2.
ljson <- length(json$poi)-2
# NA-filled result matrix: one row per kept entry, one column per field d1..d9.
data <- matrix(data=NA, nrow=ljson, ncol=9)

# NOTE(review): 1:ljson counts backwards (1, 0, ...) when ljson < 1, i.e. when
# json$poi has fewer than three entries.
for(i in 1:ljson)
{
# Offset the row index by two so that reading starts at the third entry.
j <- i+2

# Read the nine wanted fields of entry j.
# NOTE(review): `[[1]]` / `[[2]]` on a list that is shorter than the requested
# position raises "subscript out of bounds" right here, so the loop stops at
# the first entry with, for example, only one label. Nothing below this
# section runs for that entry.
d1 <- json$poi[[j]]$location$point[[1]]$Point$posList
d2 <- json$poi[[j]]$label[[1]]$value
d3 <- json$poi[[j]]$label[[2]]$value
d4 <- json$poi[[j]]$category[[1]]$value
d5 <- json$poi[[j]]$category[[2]]$value
d6 <- json$poi[[j]]$id
d7 <- json$poi[[j]]$author$value
d8 <- json$poi[[j]]$license$value
d9 <- json$poi[[j]]$description[[1]]$value

# Intended fallback: use NA when a field is absent.
# NOTE(review): exists("d1") only asks whether a variable called d1 is
# defined. Every d1..d9 was assigned just above (a NULL value still counts as
# defined), so each test is TRUE and no else branch is ever taken. Each TRUE
# branch merely repeats the extraction from the section above.
if(exists("d1") == TRUE){
    d1 <- json$poi[[j]]$location$point[[1]]$Point$posList 
} else {
    d1 <- NA 
} 
if(exists("d2") == TRUE){
    d2 <- json$poi[[j]]$label[[1]]$value 
} else {
    d2 <- NA 
}
if(exists("d3") == TRUE){
    d3 <- json$poi[[j]]$label[[2]]$value
} else {
    d3 <- NA 
}
if(exists("d4") == TRUE){
    d4 <- json$poi[[j]]$category[[1]]$value 
} else {
    d4 <- NA 
}
if(exists("d5") == TRUE){
    d5 <- json$poi[[j]]$category[[2]]$value 
} else {
    d5 <- NA 
}
if(exists("d6") == TRUE){
    d6 <- json$poi[[j]]$id 
} else {
    d6 <- NA 
}
if(exists("d7") == TRUE){
    d7 <- json$poi[[j]]$author$value 
} else {
    d7 <- NA 
}
if(exists("d8") == TRUE){
    d8 <- json$poi[[j]]$license$value 
} else {
    d8 <- NA 
}
if(exists("d9") == TRUE){
    d9 <- json$poi[[j]]$description[[1]]$value 
} else {
    d9 <- NA 
}
# Store the nine values as row i of the result matrix.
# NOTE(review): c() silently drops NULL elements, so any field that came back
# as NULL leaves fewer than nine values and they no longer line up with the
# nine columns.
data[i,] <- rbind(c(d1,d2,d3,d4,d5,d6,d7,d8,d9))    
}

Answer 1

对于 JSON 和 XML 的列表结构，str() 是你的好帮手！你可以用它来检查列表结构的全部或部分。对每个要提取的组件分别使用 sapply 可能比 for 结构更好，而且要想从这个 JSON 构建数据框（实际上很多 JSON 文件都是如此），你需要处理 NULL 以及缺失的子结构组件。以下内容可以帮助你入门，但仍有一些工作要做：

# Build a flat data frame of points of interest from the nested `json` list
# parsed earlier. Every field is read through one NULL-safe helper, so an
# entry that lacks a field (or a whole sub-structure) contributes NA instead
# of raising "subscript out of bounds" or turning a column into a list.

# Walk `path` (a list of element names and/or 1-based positions) into the
# nested list `x`.
#
# Returns the value found there as a length-one character vector, or
# NA_character_ when any step of the path is missing, or when the value at the
# end is NULL, empty, or not an atomic value.
extract_chr <- function(x, path) {
  for (key in path) {
    # Only lists can be descended into; this also covers a NULL left behind by
    # a previous step.
    if (!is.list(x)) {
      return(NA_character_)
    }
    # Check presence first: `[[` errors on a missing position instead of
    # returning NULL.
    found <- if (is.character(key)) {
      key %in% names(x)
    } else {
      key >= 1 && key <= length(x)
    }
    if (!found) {
      return(NA_character_)
    }
    x <- x[[key]]
  }
  if (!is.atomic(x) || length(x) == 0) {
    return(NA_character_)
  }
  as.character(x)[1]
}

# Extract one field, addressed by the path given in `...`, from every entry of
# `items`. vapply() guarantees a character vector with one value per entry,
# which sapply() does not when some results are NULL.
pull_field <- function(items, ...) {
  vapply(items, extract_chr, character(1), path = list(...), USE.NAMES = FALSE)
}

# simplify extraction (saves typing, too)
poi <- json$poi

# start at 3rd element; the logical mask stays correct (empty result) even
# when fewer than three entries are present, unlike 3:length(poi)
n_skip <- 2L
poi <- poi[seq_along(poi) > n_skip]

# one character vector per wanted field, NA where the field is absent
poi_points <- pull_field(poi, "location", "point", 1L, "Point", "posList")
poi_description <- pull_field(poi, "description", 1L, "value")
poi_category <- pull_field(poi, "category", 1L, "value")
poi_label <- pull_field(poi, "label", 1L, "value")
poi_id <- pull_field(poi, "id")
poi_author <- pull_field(poi, "author", "value")
poi_license <- pull_field(poi, "license", "value")

# make a data frame
poi <- data.frame(poi_label, poi_category, poi_id, poi_points, poi_author, poi_license, poi_description)

str(poi)

## 'data.frame': 4482 obs. of  7 variables:
##  $ poi_label      : Factor w/ 4482 levels "\"Bloco das Águas Livres\", edifício de habitação, comércio e serviços",..: 363 765 764 1068 174 419 461 762 420 412 ...
##  $ poi_category   : Factor w/ 129 levels "Acessórios de Uso Pessoal",..: 33 33 33 33 33 33 123 33 33 33 ...
##  $ poi_id         : Factor w/ 4482 levels "52d7bf4d723e8e0b0cc08b69",..: 2 3 4 5 7 8 15 16 17 18 ...
##  $ poi_points     : Factor w/ 3634 levels "38.405892 -9.93503",..: 975 244 478 416 301 541 2936 2975 2850 2830 ...
##  $ poi_author     : Factor w/ 1 level "CitySDK": 1 1 1 1 1 1 1 1 1 1 ...
##  $ poi_license    : Factor w/ 1 level "open-data": 1 1 1 1 1 1 1 1 1 1 ...
##  $ poi_description: Factor w/ 2831 levels "","\n","\n\n",..: 96 1051 NA NA 777 1902 NA 1038 81 82 ...
##

使用不同数量的字段解析R中的JSON URL

1 个答案: