编辑和过滤R

时间:2018-04-02 12:17:38

标签: r json dplyr tidyr jsonlite

我正在尝试显示此数据集 - > https://mtgjson.com/json/AllSets.json.zip

但是,我希望将数据展平,使其不再以大量JSON数据的形式嵌套在一层套一层的列表(列表中的列表)里。

更具体地说,我试图将数据显示为数据帧,按$releaseDate(其中一个变量)的顺序排列。

到目前为止,这是我的尝试:

library(jsonlite)
library(tidyjson)
mtgdata <- fromJSON("~/path/to/file.json")

mtgdata的结果显示了这个列表列表:

summary(mtgdata)
        Length Class  Mode
UST       9     -none- list
UNH      10     -none- list
UGL      11     -none- list
pWOS      8     -none- list
pWOR      8     -none- list
pWCQ      8     -none- list
pSUS      8     -none- list
pSUM     10     -none- list
pREL      8     -none- list
pPRO      8     -none- list
pPRE      8     -none- list
pPOD      7     -none- list
pMPR      8     -none- list
pMGD      8     -none- list
pMEI      8     -none- list
pLPA      8     -none- list
pLGM      8     -none- list
pJGP     10     -none- list
pHHO     11     -none- list
pWPN      8     -none- list
pGTW      8     -none- list
pGRU     10     -none- list
pGPX      8     -none- list
pFNM     10     -none- list
pELP      8     -none- list
pDRC      7     -none- list
pCMP      8     -none- list
pCEL      8     -none- list
pARL      8     -none- list
pALP     10     -none- list
p2HG      8     -none- list
p15A      8     -none- list
PD3       9     -none- list
PD2       9     -none- list
H09       9     -none- list
PTK      12     -none- list
POR      12     -none- list
PO2      13     -none- list
PCA       7     -none- list
PC2      10     -none- list
HOP      10     -none- list
VMA       9     -none- list
MMA      10     -none- list
MM3       8     -none- list
MM2      11     -none- list
MED       9     -none- list
ME4       9     -none- list
ME3       9     -none- list
ME2       9     -none- list
IMA       8     -none- list
EMA       9     -none- list
A25       8     -none- list
MPS_AKH   8     -none- list
MPS       9     -none- list
EXP       9     -none- list
E02       7     -none- list
V17       8     -none- list
V16       7     -none- list
V15       9     -none- list
V14       9     -none- list
V13       9     -none- list
V12      10     -none- list
V11      10     -none- list
V10       9     -none- list
V09      10     -none- list
DRB       9     -none- list
EVG       9     -none- list
DDT       7     -none- list
DDS       7     -none- list
DDR       7     -none- list
DDQ       8     -none- list
DDP      10     -none- list
DDO      10     -none- list
DDN      10     -none- list
DDM      10     -none- list
DDL      10     -none- list
DDK      10     -none- list
DDJ      10     -none- list
DDI      10     -none- list
DDH      10     -none- list
DDG      10     -none- list
DDF      10     -none- list
DDE      10     -none- list
DDD       9     -none- list
DDC       9     -none- list
DD3_JVC   9     -none- list
DD3_GVL   9     -none- list
DD3_EVG   9     -none- list
DD3_DVD   9     -none- list
DD2      11     -none- list
CNS      11     -none- list
CN2       9     -none- list
CMD      11     -none- list
CMA       7     -none- list
CM1      10     -none- list
C17       6     -none- list
C16       8     -none- list
C15      10     -none- list
C14      10     -none- list
C13      10     -none- list
CEI       9     -none- list
CED       9     -none- list
E01       7     -none- list
ARC       9     -none- list
ZEN      12     -none- list
XLN      12     -none- list
WWK      12     -none- list
WTH      13     -none- list
W17       8     -none- list
W16       8     -none- list
VIS      13     -none- list
VAN       8     -none- list
USG      13     -none- list
ULG      13     -none- list
UDS      13     -none- list
TSP      12     -none- list
TSB      12     -none- list
TPR      11     -none- list
TOR      12     -none- list
TMP      13     -none- list
THS      12     -none- list
STH      13     -none- list
SOM      12     -none- list
SOK      12     -none- list
SOI      10     -none- list
SHM      12     -none- list
SCG      12     -none- list
S99      11     -none- list
S00      11     -none- list
RTR      12     -none- list
RQS       6     -none- list
ROE      12     -none- list
RIX      12     -none- list
RAV      12     -none- list
PLS      13     -none- list
PLC      12     -none- list
PCY      13     -none- list
ORI      11     -none- list
ONS      12     -none- list
OGW      10     -none- list
ODY      13     -none- list
NPH      12     -none- list
NMS      14     -none- list
MRD      12     -none- list
MOR      12     -none- list
MMQ      13     -none- list
MIR      13     -none- list
MGB      10     -none- list
MD1       9     -none- list
MBS      12     -none- list
M15      11     -none- list
M14      11     -none- list
M13      11     -none- list
M12      11     -none- list
M11      11     -none- list
M10      11     -none- list
LRW      12     -none- list
LGN      12     -none- list
LEG      12     -none- list
LEB      11     -none- list
LEA      11     -none- list
KTK      12     -none- list
KLD       9     -none- list
JUD      12     -none- list
JOU      12     -none- list
ITP      11     -none- list
ISD      12     -none- list
INV      13     -none- list
ICE      13     -none- list
HOU       9     -none- list
HML      12     -none- list
GTC      12     -none- list
GPT      12     -none- list
FUT      12     -none- list
FRF_UGIN 10     -none- list
FRF      12     -none- list
FEM      11     -none- list
EXO      13     -none- list
EVE      12     -none- list
EMN       9     -none- list
DTK      12     -none- list
DST      12     -none- list
DRK      12     -none- list
DPA       9     -none- list
DKM       9     -none- list
DKA      12     -none- list
DIS      12     -none- list
DGM      12     -none- list
CST      11     -none- list
CSP      12     -none- list
CP3       7     -none- list
CP2       7     -none- list
CP1       7     -none- list
CON      13     -none- list
CHR      11     -none- list
CHK      12     -none- list
BTD      10     -none- list
BRB      10     -none- list
BOK      12     -none- list
BNG      12     -none- list
BFZ      12     -none- list
AVR      12     -none- list
ATQ      11     -none- list
ATH       9     -none- list
ARN      11     -none- list
ARB      12     -none- list
APC      13     -none- list
ALL      13     -none- list
ALA      12     -none- list
AKH       9     -none- list
AER       9     -none- list
9ED      12     -none- list
8ED      12     -none- list
7ED      12     -none- list
6ED      12     -none- list
5ED      12     -none- list
5DN      12     -none- list
4ED      12     -none- list
3ED      12     -none- list
2ED      11     -none- list
10E      11     -none- list

在每个列表中都有我感兴趣的分析变量,以便对这些数据进行过滤和排序,就像它是一个展平的数据帧一样。

当我们检查其中一个列表中的变量列表时(以“mtgdata $ UST”为例),我们得到了这组变量:

names(mtgdata$UST)
[1] "name"        "code"        "releaseDate" "border"      "type"        
"booster"     "mkm_name"   
[8] "mkm_id"      "cards"

在mtgdata(“mtgdata $ SOI”)中的另一个列表上运行相同的查询,我们得到另一组变量,尽管它们大多是相同的。

正如我上面提到的,我主要感兴趣的是展平此数据集并按 releaseDate 排序 - 但就目前而言,“$releaseDate”嵌套在第一层列表(“$UST”等)之中。

如能就此问题提供帮助,或就如何更好地表述这个问题给出建议,我将不胜感激。

1 个答案:

答案 0 :(得分:0)

您可以在命令行上尝试类似this的内容,将JSON对象数组转换为文件ndjson记录,然后使用类似ndjson::stream_in("filename_of the_thing_you_just_converted")的内容,但最终会得到14,000+列,相当无用,“扁平”的数据框架。

相反,先对数据的结构做一些探查:

library(tidyverse)

as1 <- jsonlite::read_json("~/Downloads/AllSets.json")

str(as1, 1) 
## List of 221
##  $ UST     :List of 9
##  $ UNH     :List of 10
##  $ UGL     :List of 11
##  $ pWOS    :List of 8
##  $ pWOR    :List of 8
##  $ pWCQ    :List of 8
##  $ pSUS    :List of 8
##  $ pSUM    :List of 10
##  $ pREL    :List of 8
##  $ pPRO    :List of 8
##  $ pPRE    :List of 8
##  $ pPOD    :List of 7
##  $ pMPR    :List of 8
##  $ pMGD    :List of 8
##  $ pMEI    :List of 8
##  $ pLPA    :List of 8
##  $ pLGM    :List of 8
##  $ pJGP    :List of 10
##  $ pHHO    :List of 11
## ...

呃...这又是“那种”JSON文件:它并没有为每条记录填充所有字段,尽管整个文件 - 理论上 - 应该是一致的。

让我们看看哪些JSON数组元素填充的字段数最多,因为这意味着可能已经填充了所有字段:

map_dbl(as1, length) %>% 
  broom::tidy() %>% 
  arrange(desc(x))
## # A tibble: 221 x 2
##    names     x
##    <chr> <dbl>
##  1 NMS    14.0
##  2 PO2    13.0
##  3 WTH    13.0
##  4 VIS    13.0
##  5 USG    13.0
##  6 ULG    13.0
##  7 UDS    13.0
##  8 TMP    13.0
##  9 STH    13.0
## 10 PLS    13.0
## # ... with 211 more rows

我们来看看NMS

str(as1[["NMS"]], 1)
## List of 14
##  $ name              : chr "Nemesis"
##  $ code              : chr "NMS"
##  $ gathererCode      : chr "NE"
##  $ magicCardsInfoCode: chr "ne"
##  $ oldCode           : chr "NEM"
##  $ releaseDate       : chr "2000-02-14"
##  $ border            : chr "black"
##  $ type              : chr "expansion"
##  $ block             : chr "Masques"
##  $ booster           :List of 15
##  $ translations      :List of 5
##  $ mkm_name          : chr "Nemesis"
##  $ mkm_id            : int 32
##  $ cards             :List of 143

你肯定不想展平 `booster`、`translations`、`cards`,而应将它们保留为 `list` 列,并在必要时再使用 `unnest`。

但是,由于每条记录的字段各不相同,我们不能简单地使用 `data.table::rbindlist()` 或 `dplyr::bind_rows()`,因为它们会对其中一些列报错。

我们必须逐条记录处理,将每条记录转换为数据框,同时处理缺失的字段,并将 `list` 类型的字段包装在 `list()` 中。我们将使用一个辅助函数来简化对缺失值的检测:

`%l0%` <- function(x, y) if (length(x) > 0) x else y

^^ 比使用 `purrr` 中的 `%||%` 更健壮。

最后:

map_df(as1, ~{
  data_frame(
    name = .x$name %l0% NA_character_,
    code = .x$code,
    gathererCode = .x$gathererCode %l0% NA_character_,
    magicCardsInfoCode = .x$magicCardsInfoCode %l0% NA_character_,
    oldCode = .x$oldCode %l0% NA_character_,
    releaseDate = .x$releaseDate %l0% NA_character_,
    border = .x$border,
    type = .x$type,
    block = .x$block %l0% NA_character_,
    booster = list(.x$booster),
    translations = list(.x$translations),
    mkm_name = .x$mkm_name %l0% NA_character_,
    mkm_id = .x$mkm_id %l0% NA_character_,
    cards = list(.x$cards)
  )
}) -> all_sets

而且,你可以看到结果:

all_sets
## # A tibble: 221 x 14
## name code gathererCode magicCardsInfoC… oldCode releaseDate border type block booster
## <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <list>
## 1 Unstable UST NA NA NA 2017-12-08 silver un NA <list […
## 2 Unhinged UNH NA uh NA 2004-11-20 silver un NA <list […
## 3 Unglued UGL UG ug NA 1998-08-11 silver un NA <list […
## 4 Wizards of th… pWOS NA wotc NA 1999-09-04 black promo NA <NULL>
## 5 Worlds pWOR NA wrl NA 1999-08-04 black promo NA <NULL>
## 6 World Magic C… pWCQ NA wmcq NA 2013-04-06 black promo NA <NULL>
## 7 Super Series pSUS NA sus NA 1999-12-01 black promo NA <NULL>
## 8 Summer of Mag… pSUM NA sum NA 2007-07-21 black promo NA <NULL>
## 9 Release Events pREL NA rep NA 2003-07-26 black promo NA <NULL>
## 10 Pro Tour pPRO NA pro NA 2007-02-09 black promo NA <NULL>
## # ... with 211 more rows, and 4 more variables: translations <list>, mkm_name <chr>, mkm_id <int>,
## # cards <list>

glimpse(all_sets)
## Observations: 221
## Variables: 14
## $ name <chr> "Unstable", "Unhinged", "Unglued", "Wizards of the Coast Online Store"...
## $ code <chr> "UST", "UNH", "UGL", "pWOS", "pWOR", "pWCQ", "pSUS", "pSUM", "pREL", "...
## $ gathererCode <chr> NA, NA, "UG", NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
## $ magicCardsInfoCode <chr> NA, "uh", "ug", "wotc", "wrl", "wmcq", "sus", "sum", "rep", "pro", "pt...
## $ oldCode <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ releaseDate <chr> "2017-12-08", "2004-11-20", "1998-08-11", "1999-09-04", "1999-08-04", ...
## $ border <chr> "silver", "silver", "silver", "black", "black", "black", "black", "bla...
## $ type <chr> "un", "un", "un", "promo", "promo", "promo", "promo", "promo", "promo"...
## $ block <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ booster <list> [["rare", "uncommon", "uncommon", "uncommon", "common", "common", "co...
## $ translations <list> [NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NU...
## $ mkm_name <chr> "Unstable", "Unhinged", "Unglued", NA, NA, NA, NA, "Summer Magic", NA,...
## $ mkm_id <int> 1821, 59, 22, NA, NA, NA, NA, 76, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
## $ cards <list> [[["Andrea Radeck", 1, ["W"], ["White"], "95ebdf85f4ea74d584dfdfb72e3...

并且,我们可以在将该列转换为正确的 `Date` 对象后按 `releaseDate` 排列它们:

mutate(all_sets, releaseDate = as.Date(releaseDate)) %>%
  arrange(releaseDate)