我有一列数据,其中包含多种不同类型的信息。它的格式类似 JSON,但我不太清楚如何使用 jsonlite 或 tidyr::separate 来处理它。如何把这一列拆分成多列?
library(tidyverse)

# Example input: one row per ID, with that ID's key/value pairs packed into
# a single pseudo-JSON string in the `data` column.
tibble(
  ID = c("A", "B", "C", "D"),
  data = c(
    "[['education', 'Ph.D., MIT'], ['interests', 'Econometrics, Causal Inference']]",
    "[['function', 'Social']]",
    "[['research_interests', 'S&P']]",
    "[['field', 'American Politics']]"
  )
)
答案 0 :(得分:1)
我敢肯定,可能有一种更优雅的方法来做到这一点,但是看看是否能产生您想要的结果:
library(tidyverse)

# Sample input, built column-wise: `ID` identifies the record and `data`
# holds that record's key/value pairs as one pseudo-JSON string.
data <- tibble(
  ID = c("A", "B", "C", "D"),
  data = c(
    "[['education', 'Ph.D., MIT'], ['interests', 'Econometrics, Causal Inference']]",
    "[['function', 'Social']]",
    "[['research_interests', 'S&P']]",
    "[['field', 'American Politics']]"
  )
)
# Keys: grab each "['" plus the text up to (not including) the next
# apostrophe, strip the leading "['", and name the list by record ID.
column_names <- data$data %>%
  str_extract_all("\\['(?<=').*?(?=')") %>%
  map(str_remove, pattern = "\\['") %>%
  setNames(data$ID)

# Values: same idea, but the marker is a comma, one whitespace character
# and an apostrophe.
values <- data$data %>%
  str_extract_all(",[:space:]'(?<=').*?(?=')") %>%
  map(str_remove, pattern = ",[:space:]'") %>%
  setNames(data$ID)
# Build the long (ID, value) and (ID, key) tables directly from the named
# lists: one row per extracted element, with the ID repeated as needed.
#
# Going through data.frame(<list>) and gather() instead only works while
# the per-ID element counts can be recycled to a common length (e.g. 1 and
# 2); counts such as 2 and 3 raise "arguments imply differing number of
# rows". It also runs the IDs through check.names, which rewrites
# non-syntactic IDs (e.g. "1" becomes "X1"). Building the tables directly
# avoids both problems and yields no recycled duplicate rows.
val_df <- data.frame(
  ID = rep(names(values), lengths(values)),
  val = as.character(unlist(values, use.names = FALSE)),
  stringsAsFactors = FALSE
)
col_df <- data.frame(
  ID = rep(names(column_names), lengths(column_names)),
  col = as.character(unlist(column_names, use.names = FALSE)),
  stringsAsFactors = FALSE
)
# Pair every key with its value (the two tables are row-aligned) and spread
# the keys into columns, one row per ID.
#
# val_df's ID column duplicates col_df's, so it is dropped before binding.
# Removing it afterwards by name is fragile: the name given to a duplicated
# column depends on the dplyr version ("ID1" before 1.0.0, "ID...3" since),
# so select(-ID1) errors on current releases.
bind_cols(col_df, select(val_df, -ID)) %>%
  distinct() %>%
  spread(col, val, fill = NA)
不幸的是,这种方法至少依赖两个假设(从上面的正则表达式可以看出):每个键名都紧跟在 `['` 之后;每个值都紧跟在“逗号、一个空白字符、单引号”(即 `, '`)之后。
我不确定它能否推广到您的其余数据,但是请告诉我是否可行。
如果同一个 ID 在同一列中有两条记录,则您至少有两个选择:
(1)为该 ID 保留两行记录;(2)用 paste 将两个值合并为一个值。
下面是评论中补充的示例记录(假设按我的编辑把键名从 research_interest 改为 research_interests,使其与原始数据中的键名一致):
# Additional example row (ID "E") in which the same key, research_interests,
# appears twice with different values.
"E", "[['research_interests', 'American Politics'], ['research_interests', 'Democratization']]"
这应该给您两个'E'记录
# Replace last step of the original answer with this:
# Keeps one output row per repeated key, so an ID that lists the same key
# twice ends up with two rows.
two_records <- bind_cols(col_df, select(val_df, -ID)) %>%
  # val_df's duplicate ID is dropped up front; the name bind_cols() would
  # give it ("ID1" vs "ID...3") depends on the dplyr version.
  distinct() %>%
  # Number repeats within each (ID, key) pair. Numbering within the key
  # alone counts across IDs, which can split a single ID's different keys
  # over several rows.
  group_by(ID, col) %>%
  mutate(grouped_id = row_number()) %>%
  ungroup() %>%
  spread(col, val, fill = NA) %>%
  select(-grouped_id)
下面的结果看起来更接近您最初想要的输出:
# Replace last step of original answer with this:
# Collapses repeated keys so that every ID ends up on exactly one row, with
# multiple values for the same key joined by ", ".
paste_records <- bind_cols(col_df, select(val_df, -ID)) %>%
  # val_df's duplicate ID is dropped up front; the name bind_cols() would
  # give it ("ID1" vs "ID...3") depends on the dplyr version.
  distinct() %>%
  # Join the values while the data is still long. Pasting after spreading
  # would also paste the NA fill of keys that occur fewer times, producing
  # strings such as "Ph.D., NA".
  group_by(ID, col) %>%
  summarise(val = paste(val, collapse = ", ")) %>%
  ungroup() %>%
  spread(col, val, fill = NA)
答案 1 :(得分:1)
与 MillionC 的回答类似——这是一种比较“蛮力”的做法,对数据格式做了大量假设:
library(tidyverse)

# Sample input: `data` holds each ID's key/value pairs as one pseudo-JSON
# string of the form [['key', 'value'], ['key', 'value'], ...].
df <- tribble(
  ~ID, ~data,
  "A", "[['education', 'Ph.D., MIT'], ['interests', 'Econometrics, Causal Inference']]",
  "B", "[['function', 'Social']]",
  "C", "[['research_interests', 'S&P']]",
  "D", "[['field', 'American Politics']]"
)

df %>%
  # One row per key/value pair. separate_rows() copes with any number of
  # pairs per ID; separating into a fixed c("x1", "x2") drops a third pair
  # and warns about every ID that has only one.
  separate_rows(data, sep = "\\], \\[") %>%
  # Split each pair into its key (k) and value (v).
  separate(data, into = c("k", "v"), sep = "', '") %>%
  # Strip the leftover brackets and quotes.
  mutate_at(vars(k:v), ~ gsub("\\[|]|'", "", .)) %>%
  # Keys become columns; IDs lacking a key get NA.
  spread(k, v)
#> # A tibble: 4 x 6
#> ID education field `function` interests research_intere…
#> <chr> <chr> <chr> <chr> <chr> <chr>
#> 1 A Ph.D., MIT <NA> <NA> Econometrics, Ca… <NA>
#> 2 B <NA> <NA> Social <NA> <NA>
#> 3 C <NA> <NA> <NA> <NA> S&P
#> 4 D <NA> American … <NA> <NA> <NA>
由reprex package(v0.2.1)于2019-04-26创建