将长度不等的嵌套JSON转换为R中的数据框

时间:2017-11-23 20:03:08

标签: json r dataframe nested-lists

我有一个JSON文件'data.json',其中包含有关不同景点的信息。

# Parse the file one line at a time: each line is passed to fromJSON()
# separately, giving one list element per line.
# NOTE(review): fromJSON() is presumably jsonlite::fromJSON -- confirm which
# package is attached.
data <- lapply(readLines("data.json"), fromJSON)

这会创建一个各元素长度不同的嵌套列表。下面是前4条记录的样本。

list(structure(list(payload = structure(list(existence_full = 1L, 
    geo_virtual = "[\"56.9459720|-2.1971226|20|within_50m|4\"]", 
    latitude = "56.945972", locality = "Stonehaven", `_records_touched` = "{\"crawl\":8,\"lssi\":0,\"polygon_centroid\":0,\"geocoder\":0,\"user_submission\":0,\"tdc\":0,\"gov\":0}", 
    address = "The Lodge, Dunottar", email = "dunnottarcastle@btconnect.com", 
    existence_ml = 0.569423821765872, domain_aggregate = "", 
    name = "Dunnottar Castle", search_tags = c("Dunnottar Castle Aberdeenshire", 
    "Dunotter Castle"), admin_region = "Scotland", existence = 1L, 
    category_labels = structure(c("Landmarks", "Buildings and Structures"
    ), .Dim = 1:2), post_town = "Stonehaven", region = "Kincardineshire", 
    review_count = "719", geocode_level = "within_50m", tel = "01569 762173", 
    placerank = 65L, longitude = "-2.197123", placerank_ml = 37.2791607346447, 
    fax = "01330 860325", category_ids_text_search = "", website = "http://www.dunnottarcastle.co.uk", 
    status = "1", geocode_confidence = "20", postcode = "AB39 2TL", 
    category_ids = 108L, country = "gb", `_geocode_quality` = "4"), .Names = c("existence_full", 
"geo_virtual", "latitude", "locality", "_records_touched", "address", 
"email", "existence_ml", "domain_aggregate", "name", "search_tags", 
"admin_region", "existence", "category_labels", "post_town", 
"region", "review_count", "geocode_level", "tel", "placerank", 
"longitude", "placerank_ml", "fax", "category_ids_text_search", 
"website", "status", "geocode_confidence", "postcode", "category_ids", 
"country", "_geocode_quality")), uuid = "3867aaf3-12ab-434f-b12b-5d627b3359c3"), .Names = c("payload", 
"uuid")), structure(list(payload = structure(list(existence_full = 1L, 
    geo_virtual = "[\"56.237480|-5.073578|20|within_50m|4\"]", 
    latitude = "56.237480", locality = "Inveraray", `_records_touched` = "{\"crawl\":11,\"lssi\":0,\"polygon_centroid\":0,\"geocoder\":0,\"user_submission\":0,\"tdc\":0,\"gov\":0}", 
    address = "Cherry Park", email = "enquiries@inveraray-castle.com", 
    longitude = "-5.073578", domain_aggregate = "", name = "Inveraray Castle", 
    admin_region = "Scotland", search_tags = c("Inveraray Castle Tea Room", 
    "Inverary Castle"), existence = 1L, category_labels = structure(c("Social", 
    "Food and Dining", "Restaurants"), .Dim = c(1L, 3L)), region = "Argyll", 
    review_count = "532", geocode_level = "within_50m", tel = "01499 302203", 
    placerank = 67L, post_town = "Inveraray", placerank_ml = 41.1997808735227, 
    fax = "01499 302421", category_ids_text_search = "", website = "http://www.inveraray-castle.com", 
    status = "1", geocode_confidence = "20", postcode = "PA32 8XE", 
    category_ids = 347L, country = "gb", `_geocode_quality` = "4", 
    existence_ml = 0.791488110284778), .Names = c("existence_full", 
"geo_virtual", "latitude", "locality", "_records_touched", "address", 
"email", "longitude", "domain_aggregate", "name", "admin_region", 
"search_tags", "existence", "category_labels", "region", "review_count", 
"geocode_level", "tel", "placerank", "post_town", "placerank_ml", 
"fax", "category_ids_text_search", "website", "status", "geocode_confidence", 
"postcode", "category_ids", "country", "_geocode_quality", "existence_ml"
)), uuid = "8278ab80-2cd1-4dbd-9685-0d0036b681eb"), .Names = c("payload", 
"uuid")), structure(list(payload = structure(list(existence_full = 1L, 
    geo_virtual = "[\"51.483872|-0.606820|100|rooftop|2\"]", 
    latitude = "51.483872", locality = "Windsor Castle", hours_display = "Mon-Sat 11:30 AM-11:00 PM; Sun 12:00 PM-11:00 PM", 
    `_records_touched` = "{\"crawl\":7,\"lssi\":0,\"polygon_centroid\":0,\"geocoder\":2,\"user_submission\":0,\"tdc\":0,\"gov\":0}", 
    address = "", longitude = "-0.606820", domain_aggregate = "", 
    name = "Windsor Castle", admin_region = "England", search_tags = c("The Windsor Castle", 
    "The Windsor Castle Pub", "The Windsor Castle Public House", 
    "Pub Food", "British"), existence = 1L, category_labels = structure(c("Landmarks", 
    "Buildings and Structures"), .Dim = 1:2), region = "Berkshire", 
    review_count = "", geocode_level = "rooftop", tel = "020 7766 7304", 
    placerank = 62L, post_town = "Windsor", placerank_ml = 28.1160845346327, 
    fax = "01753 832290", category_ids_text_search = "", website = "http://www.royalcollection.org.uk/visit/windsorcastle", 
    status = "1", hours = "{\"monday\":[[\"11:30\",\"23:00\"]],\"tuesday\":[[\"11:30\",\"23:00\"]],\"wednesday\":[[\"11:30\",\"23:00\"]],\"thursday\":[[\"11:30\",\"23:00\"]],\"friday\":[[\"11:30\",\"23:00\"]],\"saturday\":[[\"11:30\",\"23:00\"]],\"sunday\":[[\"12:00\",\"23:00\"]]}", 
    neighborhood = "Chalvey", geocode_confidence = "100", postcode = "SL4 1NJ", 
    category_ids = 108L, country = "gb", `_geocode_quality` = "2", 
    existence_ml = 0.885705196944165, email = "bookinginfo@royalcollection.org.uk"), .Names = c("existence_full", 
"geo_virtual", "latitude", "locality", "hours_display", "_records_touched", 
"address", "longitude", "domain_aggregate", "name", "admin_region", 
"search_tags", "existence", "category_labels", "region", "review_count", 
"geocode_level", "tel", "placerank", "post_town", "placerank_ml", 
"fax", "category_ids_text_search", "website", "status", "hours", 
"neighborhood", "geocode_confidence", "postcode", "category_ids", 
"country", "_geocode_quality", "existence_ml", "email")), uuid = "c5f7d8a9-0851-46ef-8da7-ad55e187d3a8"), .Names = c("payload", 
"uuid")), structure(list(payload = structure(list(existence_full = 1L, 
    category_ids_text_search = "", placerank_ml = 31.9857184762157, 
    longitude = "-2.191955", name = "Pitmedden Garden", domain_aggregate = "", 
    admin_region = "Scotland", languages = "English", region = "Aberdeenshire", 
    review_count = "2", geocode_level = "rooftop", tel = "01651 842352", 
    placerank = 57L, post_town = "Ellon", category_labels = structure(c("Landmarks", 
    "Gardens"), .Dim = 1:2), existence = 1L, fax = "0844 493 2102", 
    website = "http://www.nts.org.uk/Property/Pitmedden-Garden", 
    status = "1", geocode_confidence = "100", postcode = "AB41 7PD", 
    country = "gb", category_ids = 109L, `_geocode_quality` = "4", 
    existence_ml = 0.849871115334588, email = "information@nts.org.uk", 
    address = "", `_records_touched` = "{\"crawl\":6,\"lssi\":0,\"polygon_centroid\":0,\"geocoder\":0,\"user_submission\":0,\"tdc\":0,\"gov\":0}", 
    locality = "Pitmedden", latitude = "57.343233", geo_virtual = "[\"57.343233|-2.191955|100|rooftop|4\"]"), .Names = c("existence_full", 
"category_ids_text_search", "placerank_ml", "longitude", "name", 
"domain_aggregate", "admin_region", "languages", "region", "review_count", 
"geocode_level", "tel", "placerank", "post_town", "category_labels", 
"existence", "fax", "website", "status", "geocode_confidence", 
"postcode", "country", "category_ids", "_geocode_quality", "existence_ml", 
"email", "address", "_records_touched", "locality", "latitude", 
"geo_virtual")), uuid = "bb57a153-740f-42be-aa4d-ae12d4eb57d4"), .Names = c("payload", 
"uuid")))

我想把这个嵌套列表转换为数据框,把各个子列表中的值填入对应的列。每个子列表都包含某个特定地点的信息,并由uuid标识。因此,数据框中的每一行都应包含某个特定uuid的信息。对于没有相应值的列,应显示NA。

我尝试过类似问题中提到的一些方法,但都没有成功。

任何想法都将不胜感激!感谢

1 个答案:

答案 0 :(得分:0)

如果能对原始数据的布局有更完整的描述会更有帮助,不过下面是我根据该对象的顶层结构所做的猜测。假设这个structure对象被命名为dat:

> lapply(dat, names)
[[1]]
[1] "payload" "uuid"   

[[2]]
[1] "payload" "uuid"   

[[3]]
[1] "payload" "uuid"   

[[4]]
[1] "payload" "uuid" 

因此,将它们分别提取到两个数据框列表中:

# Build one data frame per record from its `payload` list. Fields holding
# several values (e.g. `search_tags`) yield several rows for that record,
# which is why the row counts printed further down differ between records.
payloads <- lapply(dat, function(x) data.frame(x$payload))
# Build a one-column data frame per record from its `uuid`. The column name
# ("x.uuid" in the names listing further down) appears to be derived from the
# expression `x$uuid`, so renaming the argument `x` would rename the column.
uuids <- lapply(dat, function(x) data.frame(x$uuid))

然后将它们"并排"绑在一起:

# Bind each payload data frame to its uuid data frame side by side.
# SIMPLIFY = FALSE guarantees a list of data frames; with the default
# (SIMPLIFY = TRUE) mapply() collapses the result into a matrix whenever
# every combined data frame happens to have the same number of columns.
newdat <- mapply(cbind, payloads, uuids, SIMPLIFY = FALSE)

然后查看维度,检查单行数据框是否被正确地复制到了多行数据框上。唯一不符合您要求的一点是NA。因为'uuids'显然是标识符,cbind操作会把每一列的内容重复填充,使其长度与对应的'payloads'数据框相同:

> lapply(payloads, dim)
[[1]]
[1]  2 32

[[2]]
[1]  2 33

[[3]]
[1]  5 35

[[4]]
[1]  1 32

> lapply(uuids, dim)
[[1]]
[1] 1 1

[[2]]
[1] 1 1

[[3]]
[1] 1 1

[[4]]
[1] 1 1

> lapply( mapply( cbind, payloads, uuids), dim)
[[1]]
[1]  2 33

[[2]]
[1]  2 34

[[3]]
[1]  5 36

[[4]]
[1]  1 33

下一步合并可以把所有数据框上下堆叠(按行合并)起来,因为它们的列名非常相似:

lapply( newdat, names)
[[1]]
 [1] "existence_full"           "geo_virtual"              "latitude"                
 [4] "locality"                 "X_records_touched"        "address"                 
 [7] "email"                    "existence_ml"             "domain_aggregate"        
[10] "name"                     "search_tags"              "admin_region"            
[13] "existence"                "category_labels.1"        "category_labels.2"       
[16] "post_town"                "region"                   "review_count"            
[19] "geocode_level"            "tel"                      "placerank"               
[22] "longitude"                "placerank_ml"             "fax"                     
[25] "category_ids_text_search" "website"                  "status"                  
[28] "geocode_confidence"       "postcode"                 "category_ids"            
[31] "country"                  "X_geocode_quality"        "x.uuid"                  

[[2]]
 [1] "existence_full"           "geo_virtual"              "latitude"                
 [4] "locality"                 "X_records_touched"        "address"                 
 [7] "email"                    "longitude"                "domain_aggregate"        
[10] "name"                     "admin_region"             "search_tags"             
[13] "existence"                "category_labels.1"        "category_labels.2"       
[16] "category_labels.3"        "region"                   "review_count"            
[19] "geocode_level"            "tel"                      "placerank"               
[22] "post_town"                "placerank_ml"             "fax"                     
[25] "category_ids_text_search" "website"                  "status"                  
[28] "geocode_confidence"       "postcode"                 "category_ids"            
[31] "country"                  "X_geocode_quality"        "existence_ml"            
[34] "x.uuid"                  

[[3]]
 [1] "existence_full"           "geo_virtual"              "latitude"                
 [4] "locality"                 "hours_display"            "X_records_touched"       
 [7] "address"                  "longitude"                "domain_aggregate"        
[10] "name"                     "admin_region"             "search_tags"             
[13] "existence"                "category_labels.1"        "category_labels.2"       
[16] "region"                   "review_count"             "geocode_level"           
[19] "tel"                      "placerank"                "post_town"               
[22] "placerank_ml"             "fax"                      "category_ids_text_search"
[25] "website"                  "status"                   "hours"                   
[28] "neighborhood"             "geocode_confidence"       "postcode"                
[31] "category_ids"             "country"                  "X_geocode_quality"       
[34] "existence_ml"             "email"                    "x.uuid"                  

[[4]]
 [1] "existence_full"           "category_ids_text_search" "placerank_ml"            
 [4] "longitude"                "name"                     "domain_aggregate"        
 [7] "admin_region"             "languages"                "region"                  
[10] "review_count"             "geocode_level"            "tel"                     
[13] "placerank"                "post_town"                "category_labels.1"       
[16] "category_labels.2"        "existence"                "fax"                     
[19] "website"                  "status"                   "geocode_confidence"      
[22] "postcode"                 "country"                  "category_ids"            
[25] "X_geocode_quality"        "existence_ml"             "email"                   
[28] "address"                  "X_records_touched"        "locality"                
[31] "latitude"                 "geo_virtual"              "x.uuid"    

Hadley的plyr包中的rbind.fill函数可以有效地执行此操作:

# Install plyr only when it is missing, instead of reinstalling on every run.
if (!requireNamespace("plyr", quietly = TRUE)) {
  install.packages("plyr")
}
# Stack all per-record data frames; rbind.fill() fills columns that a
# record lacks with NA.
newdat3 <- do.call(plyr::rbind.fill, newdat)
newdat3

所以看一下几列,这似乎满足了你的要求:

> newdat3[ , c("locality", "category_labels.3", "neighborhood")]
         locality category_labels.3 neighborhood
1      Stonehaven              <NA>         <NA>
2      Stonehaven              <NA>         <NA>
3       Inveraray       Restaurants         <NA>
4       Inveraray       Restaurants         <NA>
5  Windsor Castle              <NA>      Chalvey
6  Windsor Castle              <NA>      Chalvey
7  Windsor Castle              <NA>      Chalvey
8  Windsor Castle              <NA>      Chalvey
9  Windsor Castle              <NA>      Chalvey
10      Pitmedden              <NA>         <NA>