我有一个数据框,如:
# A tibble: 2 x 3
id text_1 text_2
<int> <chr> <chr>
1 1 "{1=>{:name=>\"aaa\", :priority=>0, :count=>4}, 7=… "{:name=>\"bbb\", :priority=>0, :count=>4}, {:name=>\"ddd\", :…
2 2 "{1=>{:name=>\"aaa\", :priority=>0, :count=>5}, 3=… "{:name=>\"bbb\", :priority=>0, :count=>4}, {:name=>\"ccc\", :…
可重现:
# dput() output of the example tibble: an integer `id` column plus two
# character columns (`text_1`, `text_2`) holding Ruby-hash-like strings.
# The trailing `spec` attribute is the column specification carried by the
# tibble. Assign the result to a variable before running the answers below
# (they refer to it as `df` and `dat`).
structure(list(id = 1:2, text_1 = c("{1=>{:name=>\"aaa\", :priority=>0, :count=>4}, 7=>{:name=>\"bbb\", :priority=>0, :count=>2}}",
"{1=>{:name=>\"aaa\", :priority=>0, :count=>5}, 3=>{:name=>\"ccc\", :priority=>0, :count=>3}}"
), text_2 = c("{:name=>\"bbb\", :priority=>0, :count=>4}, {:name=>\"ddd\", :priority=>0, :count=>2}",
"{:name=>\"bbb\", :priority=>0, :count=>4}, {:name=>\"ccc\", :priority=>0, :count=>2}, {:name=>\"ddd\", :priority=>0, :count=>9}"
)), row.names = c(NA, -2L), class = c("tbl_df", "tbl", "data.frame"
), spec = structure(list(cols = list(id = structure(list(), class = c("collector_integer",
"collector")), text_1 = structure(list(), class = c("collector_character",
"collector")), text_2 = structure(list(), class = c("collector_character",
"collector"))), default = structure(list(), class = c("collector_guess",
"collector"))), class = "col_spec"))
其中每行包含要分析的字符串。
text_1 列中的每条记录都带有数字标识符(例如 1=>{...}),而 text_2 列中的记录没有标识符,只是一组用花括号括起来的记录。
我希望得到如下结果:对于在某个 id 和 level 组合中不存在的产品,将其 count 设置为零。
产品数量有限,因此需要使用所有组合进行填充。
# A tibble: 14 x 5
id product priority count level
<int> <chr> <int> <int> <chr>
1 1 aaa 0 4 text_1
2 1 bbb 0 4 text_1
3 1 ccc 0 0 text_1
4 2 aaa 0 5 text_1
5 2 bbb 0 0 text_1
6 2 ccc 0 3 text_1
7 1 aaa 0 0 text_2
8 1 bbb 0 4 text_2
9 1 ccc 0 0 text_2
10 1 ddd 0 2 text_2
11 2 aaa 0 0 text_2
12 2 bbb 0 4 text_2
13 2 ccc 0 2 text_2
14 2 ddd 0 9 text_2
我认为需要用到 extract 的某种组合,但我不知道该如何下手。
答案 0 :(得分:3)
如果它最初是 JSON(如 @neilfws 建议),则使用 jsonlite 包之类的工具进行解析会更容易。如果不是,请尝试以下操作:
library(tidyr)
library(dplyr)
library(stringr)
# Parse the Ruby-hash-like strings in `text_1` / `text_2` into one row per
# (id, level, product), then pad the result so that every
# id x product x level combination is present, with priority and count set
# to 0 for combinations that do not occur in the data.
df %>%
  # Long format: one row per (id, level), raw string in `ugly_text`.
  gather(level, ugly_text, text_1, text_2) %>%
  # Entries are delimited by "},": put each "{:name=>..., ...}" entry on
  # its own row instead of spreading them over a computed number of columns.
  separate_rows(ugly_text, sep = "\\},") %>%
  mutate(
    product = str_match(ugly_text, ':name=>\\"(.*?)\\"')[, 2],
    # `[0-9]+` (not `[0-9]`) so that multi-digit values such as
    # `:count=>12` are captured whole rather than truncated to "1".
    priority = str_match(ugly_text, ":priority=>([0-9]+)")[, 2],
    count = str_match(ugly_text, ":count=>([0-9]+)")[, 2]
  ) %>%
  select(id, product, priority, count, level) %>%
  # Join against the full grid of observed ids, products and levels.
  # Building the grid from unique values avoids an n^3-row intermediate, and
  # stringsAsFactors = FALSE keeps the join keys character (expand.grid()
  # converts strings to factors by default).
  full_join(
    expand.grid(
      id = unique(.$id),
      product = unique(.$product),
      level = unique(.$level),
      stringsAsFactors = FALSE
    ),
    by = c("id", "product", "level")
  ) %>%
  # Rows that exist only in the grid have NA here; they become 0.
  mutate_at(vars(priority, count), ~ if_else(is.na(.x), 0, as.numeric(.x))) %>%
  arrange(level, id, product)
# A tibble: 16 x 5
id product priority count level
<int> <chr> <dbl> <dbl> <chr>
1 1 aaa 0 4 text_1
2 1 bbb 0 2 text_1
3 1 ccc 0 0 text_1
4 1 ddd 0 0 text_1
5 2 aaa 0 5 text_1
6 2 bbb 0 0 text_1
7 2 ccc 0 3 text_1
8 2 ddd 0 0 text_1
9 1 aaa 0 0 text_2
10 1 bbb 0 4 text_2
11 1 ccc 0 0 text_2
12 1 ddd 0 2 text_2
13 2 aaa 0 0 text_2
14 2 bbb 0 4 text_2
15 2 ccc 0 2 text_2
16 2 ddd 0 9 text_2
我不确定您写“产品数量有限,因此需要使用所有组合进行填充”时的意思——但这至少应该是一个开始。
答案 1 :(得分:1)
这是我使用 tidyverse 包清理该数据集的尝试。结果与所需的输出不同,因为您的输出在某些组中包含了 ccc 或 ddd 之类在原始数据中并不存在的组合。但是,我无法弄清楚其中的规则,例如为什么 text_2 中有 aaa,而 text_1 中却没有 ddd。因此,在获得新信息之前,我决定让结果保持原样。
library(tidyverse)
# Tidy `dat` into one row per (id, level, product) with integer `priority`
# and `count` columns. Only combinations that actually occur in the data are
# returned.
dat2 <- dat %>%
  # Long format: one row per (id, level), raw string in `text`.
  gather(level, text, starts_with("text_")) %>%
  # Splitting on ", " leaves one "key=>value" fragment per row.
  separate_rows(text, sep = ", ") %>%
  mutate(text = map(text, function(fragment) {
    # A fragment may carry a leading "1=>{" identifier, so keep only the last
    # two "=>"-separated tokens (key and value) and strip the punctuation
    # (braces, colons, quotes) around them.
    tokens <- str_split(fragment, pattern = "=>")[[1]]
    str_replace_all(tail(tokens, 2), "[:punct:]", "")
  })) %>%
  mutate(Column = map_chr(text, 1), Value = map_chr(text, 2)) %>%
  # Plain function instead of the deprecated funs() helper.
  mutate_at(vars(Column, Value), str_trim) %>%
  select(-text) %>%
  # Every "name" key starts a new record; Group keeps the record's rows
  # together so spread() can put its keys side by side.
  mutate(Group = cumsum(Column %in% "name")) %>%
  spread(Column, Value) %>%
  # The parsed values are strings; the desired output has integer columns.
  mutate_at(vars(priority, count), as.integer) %>%
  select(id, product = name, priority, count, level) %>%
  arrange(level, id, product)
dat2
# # A tibble: 9 x 5
# id product priority count level
# <int> <chr> <int> <int> <chr>
# 1 1 aaa 0 4 text_1
# 2 1 bbb 0 2 text_1
# 3 2 aaa 0 5 text_1
# 4 2 ccc 0 3 text_1
# 5 1 bbb 0 4 text_2
# 6 1 ddd 0 2 text_2
# 7 2 bbb 0 4 text_2
# 8 2 ccc 0 2 text_2
# 9 2 ddd 0 9 text_2
我尝试添加一个 complete 调用,把数据框扩展到所有组合。但是,现在输出的行数比所需的输出多,因为 ddd 现在也出现在 text_1 中。同样,我不清楚生成所需输出的规则是什么。
library(tidyverse)
# Tidy `dat` into one row per (id, level, product) and expand it to every
# name x id x level combination, filling `priority` and `count` with 0 for
# combinations that do not occur in the data.
dat2 <- dat %>%
  # Long format: one row per (id, level), raw string in `text`.
  gather(level, text, starts_with("text_")) %>%
  # Splitting on ", " leaves one "key=>value" fragment per row.
  separate_rows(text, sep = ", ") %>%
  mutate(text = map(text, function(fragment) {
    # A fragment may carry a leading "1=>{" identifier, so keep only the last
    # two "=>"-separated tokens (key and value) and strip the punctuation
    # (braces, colons, quotes) around them.
    tokens <- str_split(fragment, pattern = "=>")[[1]]
    str_replace_all(tail(tokens, 2), "[:punct:]", "")
  })) %>%
  mutate(Column = map_chr(text, 1), Value = map_chr(text, 2)) %>%
  # Plain function instead of the deprecated funs() helper.
  mutate_at(vars(Column, Value), str_trim) %>%
  select(-text) %>%
  # Every "name" key starts a new record; Group keeps the record's rows
  # together so spread() can put its keys side by side.
  mutate(Group = cumsum(Column %in% "name")) %>%
  spread(Column, Value) %>%
  # Convert before complete(): the fill values must have the same type as
  # the columns they fill, and a numeric 0 cannot be cast to character.
  mutate_at(vars(priority, count), as.integer) %>%
  complete(name, id, level, fill = list(priority = 0L, count = 0L)) %>%
  select(id, product = name, priority, count, level) %>%
  arrange(level, id, product)
dat2
# # A tibble: 16 x 5
# id product priority count level
# <int> <chr> <int> <int> <chr>
# 1 1 aaa 0 4 text_1
# 2 1 bbb 0 2 text_1
# 3 1 ccc 0 0 text_1
# 4 1 ddd 0 0 text_1
# 5 2 aaa 0 5 text_1
# 6 2 bbb 0 0 text_1
# 7 2 ccc 0 3 text_1
# 8 2 ddd 0 0 text_1
# 9 1 aaa 0 0 text_2
#10 1 bbb 0 4 text_2
#11 1 ccc 0 0 text_2
#12 1 ddd 0 2 text_2
#13 2 aaa 0 0 text_2
#14 2 bbb 0 4 text_2
#15 2 ccc 0 2 text_2
#16 2 ddd 0 9 text_2