我的一个数据集如下
name alias (list of alias)
x c("R","V","Q")
y "Z"
q c("A", "R", "M")
w c("C","A","R")
我想首先简化表格如下
name alias
x "R"
x "V"
x "Q"
y "Z"
q "A"
q "R"
q "M"
w "C"
w "A"
w "R"
然后再转换数据,得到如下结果
alias name
"R" c(x,q,w)
"V" x
"Q" x
"Z" y
"A" c(q,w)
"M" q
"C" w
我怎样才能在R中实现这一目标?
这是实际的数据集
> dput(head(cases))
structure(list(caseid = c(7703415, 7758128, 7858259, 8802954,
8829620, 8847200), tcount = c(2L, 2L, 3L, 10L, 4L, 2L), helplinks = c("character(0",
"c(\"60107\", \"56085\", \"57587\", \"3000020\"", "character(0",
"character(0", "c(\"60107\", \"3000023\", \"3000020\", \"60107\", \"56085\", \"57587\"",
"character(0")), .Names = c("caseid", "tcount", "helplinks"), row.names = c(NA,
6L), class = "data.frame")
> head(cases)
caseid tcount helplinks
1 7703415 2 character(0
2 7758128 2 c("60107", "56085", "57587", "3000020"
3 7858259 3 character(0
4 8802954 10 character(0
5 8829620 4 c("60107", "3000023", "3000020", "60107", "56085", "57587"
6 8847200 2 character(0
答案 0 :(得分:4)
使用我的 "splitstackshape" 包中的 cSplit:
cSplit(cases, "helplinks", ",", "long")[, helplinks := gsub(
'character\\(0|c\\(|\\"', "", helplinks)][, list(
caseid = list(caseid)), by = helplinks]
# helplinks caseid
# 1: 7703415,7858259,8802954,8847200
# 2: 60107 7758128,8829620,8829620
# 3: 56085 7758128,8829620
# 4: 57587 7758128,8829620
# 5: 3000020 7758128,8829620
# 6: 3000023 8829620
我假设您的初始数据类似这样:
df <- data.frame(
name = c("x", "y", "q", "w"),
alias = I(list(c("R","V","Q"), "Z", c("A", "R", "M"), c("C","A","R")))
)
df
# name alias
# 1 x R, V, Q
# 2 y Z
# 3 q A, R, M
# 4 w C, A, R
如果是这种情况,可以结合 "data.table" 使用我的 "splitstackshape" 包中的 listCol_l 方法。
library(splitstackshape)
listCol_l(df, "alias")[, list(name = list(name)), by = alias_ul]
# alias_ul name
# 1: R x,q,w
# 2: V x
# 3: Q x
# 4: Z y
# 5: A q,w
# 6: M q
# 7: C w
其实你并不需要 "splitstackshape"。如果你想去掉我答案中自我推销的部分,只使用 "data.table" 就可以,做法如下:
library(data.table)
as.data.table(df)[, list(
alias = unlist(alias)), by = name][, list(
name = list(name)), by = alias]
答案 1 :(得分:2)
首先我们清理掉 "character(0"。然后我们读入那些原本是列表、但现在需要用 scan 解析的字符值。接着我们应用一个函数,为每一行创建一个数据框:
good.case <- cases[ grepl("c\\(", cases$helplinks),]
lapply( split(good.case, row.names(good.case) ), function(d){
vec <- scan(text=gsub("c\\(|[, ]", "", d$helplinks) ,what="")
do.call( data.frame, list(caseid=d$caseid, alias=vec) )
}
)
#-------
#Read 4 items
#Read 6 items
$`2`
caseid alias
1 7758128 60107
2 7758128 56085
3 7758128 57587
4 7758128 3000020
$`5`
caseid alias
1 8829620 60107
2 8829620 3000023
3 8829620 3000020
4 8829620 60107
5 8829620 56085
6 8829620 57587
expanded <- lapply( split(good.case, row.names(good.case) ), function(d){
vec <- scan(text=gsub("c\\(|[, ]", "", d$helplinks) ,what="")
do.call( data.frame, list(caseid=rep(d$caseid, length(vec)), alias=vec) )
}
)
#Read 4 items
#Read 6 items
现在我们将数据帧绑定在一起:
do.call(rbind, expanded)
#---------------
caseid alias
2.1 7758128 60107
2.2 7758128 56085
2.3 7758128 57587
2.4 7758128 3000020
5.1 8829620 60107
5.2 8829620 3000023
5.3 8829620 3000020
5.4 8829620 60107
5.5 8829620 56085
5.6 8829620 57587
但我认为只有一半的方式。没有必要进一步追求Ananda在那里的5个问候答案。