我已经从YELP公共数据挑战中下载了一组JSON文件:https://www.yelp.com/dataset/challenge
它们提供的是NDJSON格式的文件。我已经能够使用以下代码读取它们:
library(jsonlite)
# Read the NDJSON file into a data frame using jsonlite's streaming reader.
df <- stream_in(file("file_path"))
不幸的是,仍然有一些属性列似乎是嵌套的data.frame,如果不手动逐个创建新列,就无法解析它们。
示例:
df$attributes$BusinessParking
是一个字符列,其中包含:
{'garage': False, 'street': True, 'validated': False, 'lot': False, 'valet': False}
此列中有NA值。我希望能够将其解析为5个二进制列。有没有我遗漏的办法可以做到这一点?我是R的新手,虽然做了一些调查,但还没有找到任何解决方案。
答案 0 :(得分:0)
您可以使用 `$` 访问器运算符把嵌套的列重新赋值为顶层列。鉴于Maurits Evers在评论中提到数据样本大小约为3 GB,我基于公开的Yelp Dataset JSON文档中的business.json数据样本创建了一个示例(请参阅文章末尾)。另外,您需要使用 `paste0` 将 `categories` 连接成一个字符向量,以避免每个JSON实体产生多行记录。
yelp.R
library(jsonlite)

# Flatten the nested business records from business.json into a single
# data frame with one row per business and only atomic columns.
df <- jsonlite::fromJSON("business.json")

# Promote the nested attribute fields to top-level columns.
df$RestaurantsTakeOut <- df$attributes$RestaurantsTakeOut
df_bp <- df$attributes$BusinessParking
df_wh <- df$hours
df <- cbind(df, df_bp, df_wh)

# Collapse each business's categories into one comma-separated string so that
# every business stays on a single row. vapply() (rather than sapply())
# guarantees a character vector even for empty or unusual input.
df$categories <- vapply(
  df$categories,
  paste0,
  character(1),
  collapse = ", "
)

# The nested originals are no longer needed once their fields are promoted.
df$attributes <- NULL
df$hours <- NULL

str(df)
输出:
'data.frame': 2 obs. of 26 variables:
$ business_id : chr "tnhfDv5Il8EaGSXZGiuQGg" "tnhfDv5Il8EaGSXZGiuQGg"
$ name : chr "Garaje" "Garaje"
$ neighborhood : chr "SoMa" "SoMa"
$ address : chr "475 3rd St" "475 3rd St"
$ city : chr "San Francisco" "San Francisco"
$ state : chr "CA" "CA"
$ postal code : chr "94107" "94107"
$ latitude : num 37.8 37.8
$ longitude : num -122 -122
$ stars : num 4.5 4.5
$ review_count : int 1198 1198
$ is_open : int 1 1
$ categories : chr "Mexican, Burgers, Gastropubs" "Mexican, Burgers, Gastropubs"
$ RestaurantsTakeOut: logi TRUE TRUE
$ garage : logi FALSE FALSE
$ street : logi TRUE TRUE
$ validated : logi FALSE FALSE
$ lot : logi FALSE FALSE
$ valet : logi FALSE FALSE
$ Monday : chr "10:00-21:00" "10:00-21:00"
$ Tuesday : chr "10:00-21:00" "10:00-21:00"
$ Friday : chr "10:00-21:00" "10:00-21:00"
$ Wednesday : chr "10:00-21:00" "10:00-21:00"
$ Thursday : chr "10:00-21:00" "10:00-21:00"
$ Sunday : chr "11:00-18:00" "11:00-18:00"
$ Saturday : chr "10:00-21:00" "10:00-21:00"
business.json
[{
"business_id": "tnhfDv5Il8EaGSXZGiuQGg",
"name": "Garaje",
"neighborhood": "SoMa",
"address": "475 3rd St",
"city": "San Francisco",
"state": "CA",
"postal code": "94107",
"latitude": 37.7817529521,
"longitude": -122.39612197,
"stars": 4.5,
"review_count": 1198,
"is_open": 1,
"attributes": {
"RestaurantsTakeOut": true,
"BusinessParking": {
"garage": false,
"street": true,
"validated": false,
"lot": false,
"valet": false
}
},
"categories": [
"Mexican",
"Burgers",
"Gastropubs"
],
"hours": {
"Monday": "10:00-21:00",
"Tuesday": "10:00-21:00",
"Friday": "10:00-21:00",
"Wednesday": "10:00-21:00",
"Thursday": "10:00-21:00",
"Sunday": "11:00-18:00",
"Saturday": "10:00-21:00"
}
},
{
"business_id": "tnhfDv5Il8EaGSXZGiuQGg",
"name": "Garaje",
"neighborhood": "SoMa",
"address": "475 3rd St",
"city": "San Francisco",
"state": "CA",
"postal code": "94107",
"latitude": 37.7817529521,
"longitude": -122.39612197,
"stars": 4.5,
"review_count": 1198,
"is_open": 1,
"attributes": {
"RestaurantsTakeOut": true,
"BusinessParking": {
"garage": false,
"street": true,
"validated": false,
"lot": false,
"valet": false
}
},
"categories": [
"Mexican",
"Burgers",
"Gastropubs"
],
"hours": {
"Monday": "10:00-21:00",
"Tuesday": "10:00-21:00",
"Friday": "10:00-21:00",
"Wednesday": "10:00-21:00",
"Thursday": "10:00-21:00",
"Sunday": "11:00-18:00",
"Saturday": "10:00-21:00"
}
}]