简化并总结R中的数据表

时间:2014-11-20 04:28:34

标签: r

我的一个数据集如下

name  alias (list of alias)
x     c("R","V","Q")
y     "Z"
q     c("A", "R", "M")
w     c("C","A","R")

我想首先简化表格如下

name alias 
x  "R"
x  "V"
x  "Q"
y  "Z"
q  "A"
q  "R"
q  "M"
w  "C"
w  "A"
w  "R"

以后修改数据以获取

alias name 
"R"   c(x,q,w)
"V"   x
"Q"   x
"Z"   y
"A"  c(q,w)
"M"  q
"C"  w

我怎样才能在R中实现这一目标?

这是实际的数据集

> dput(head(cases))
structure(list(caseid = c(7703415, 7758128, 7858259, 8802954, 
8829620, 8847200), tcount = c(2L, 2L, 3L, 10L, 4L, 2L), helplinks = c("character(0", 
"c(\"60107\", \"56085\", \"57587\", \"3000020\"", "character(0", 
"character(0", "c(\"60107\", \"3000023\", \"3000020\", \"60107\", \"56085\", \"57587\"", 
"character(0")), .Names = c("caseid", "tcount", "helplinks"), row.names = c(NA, 
6L), class = "data.frame")

> head(cases)
   caseid tcount                                                  helplinks
1 7703415      2                                                character(0
2 7758128      2                     c("60107", "56085", "57587", "3000020"
3 7858259      3                                                character(0
4 8802954     10                                                character(0
5 8829620      4 c("60107", "3000023", "3000020", "60107", "56085", "57587"
6 8847200      2                                                character(0

2 个答案:

答案 0 :(得分:4)

新答案

使用我的 "splitstackshape" 包中的 cSplit 函数:

# NOTE: cSplit() comes from the "splitstackshape" package, which must be
# loaded first; the `:=` / `by =` chaining is data.table syntax.
# 1. Split the helplinks text on "," in the "long" direction.
# 2. Remove the leftover 'character(0', 'c(' and double-quote characters
#    from each piece.
# 3. Group by the cleaned helplinks value and collect the caseid values
#    of each group into a list.
cSplit(cases, "helplinks", ",", "long")[, helplinks := gsub(
  'character\\(0|c\\(|\\"', "", helplinks)][, list(
    caseid = list(caseid)), by = helplinks]
#    helplinks                          caseid
# 1:           7703415,7858259,8802954,8847200
# 2:     60107         7758128,8829620,8829620
# 3:     56085                 7758128,8829620
# 4:     57587                 7758128,8829620
# 5:   3000020                 7758128,8829620
# 6:   3000023                         8829620

旧答案

我假设您从这样的事情开始:

# Example input: one row per name, with a list-column holding that
# name's aliases (one or more per row).
df <- data.frame(name = c("x", "y", "q", "w"))
# I() marks the list "as is" so it is stored as a single list-column.
df$alias <- I(list(
  c("R", "V", "Q"),
  "Z",
  c("A", "R", "M"),
  c("C", "A", "R")
))
# Print the example data.
df
#   name   alias
# 1    x R, V, Q
# 2    y       Z
# 3    q A, R, M
# 4    w C, A, R

如果是这种情况,可以使用我的 "splitstackshape" 包中的 listCol_l,再结合 "data.table" 的方法:
# Load splitstackshape, which provides listCol_l().
library(splitstackshape)
# listCol_l() unlists the list-column "alias" into long form (the unlisted
# values end up in a column named alias_ul, which is what `by` refers to);
# the data.table-style `[` then collects the names for each alias into a list.
listCol_l(df, "alias")[, list(name = list(name)), by = alias_ul]
#    alias_ul  name
# 1:        R x,q,w
# 2:        V     x
# 3:        Q     x
# 4:        Z     y
# 5:        A   q,w
# 6:        M     q
# 7:        C     w

其实你并不一定需要 "splitstackshape"。如果你想去掉我答案中自我推销的部分,只用 "data.table" 也可以这样做:

# Pure data.table version of the same reshaping.
library(data.table)
# Step 1: for each name, unlist its aliases, giving one row per
#         (name, alias) pair.
# Step 2: regroup by alias and collect the matching names into a list.
as.data.table(df)[, list(
  alias = unlist(alias)), by = name][, list(
  name = list(name)), by = alias]

答案 1 :(得分:2)

首先我们清除 "character(0" 的行。然后读入那些原本是列表、但现在需要用 scan 解析的字符值。接着应用一个函数,从每一行创建一个数据框:

# Keep only the rows whose helplinks text contains a "c(" vector literal.
good.case <- cases[grepl("c\\(", cases[["helplinks"]]), ]
# Treat every kept row separately (split by row name) and build a small
# caseid/alias data frame from it; the resulting list is printed.
lapply(split(good.case, row.names(good.case)), function(one_row) {
  # Drop "c(", commas and spaces, then let scan() read the remaining
  # items as character values.
  stripped <- gsub("c\\(|[, ]", "", one_row$helplinks)
  links <- scan(text = stripped, what = "")
  # caseid is shorter than links here, so data.frame() recycles it.
  data.frame(caseid = one_row$caseid, alias = links)
})
#-------
#Read 4 items
#Read 6 items
$`2`
   caseid   alias
1 7758128   60107
2 7758128   56085
3 7758128   57587
4 7758128 3000020

$`5`
   caseid   alias
1 8829620   60107
2 8829620 3000023
3 8829620 3000020
4 8829620   60107
5 8829620   56085
6 8829620   57587

 # Same per-row expansion as before, but keep the result in `expanded` and
 # repeat caseid explicitly, once per link, instead of relying on recycling.
 expanded <- lapply(split(good.case, row.names(good.case)), function(one_row) {
   # Drop "c(", commas and spaces, then let scan() read the remaining
   # items as character values.
   stripped <- gsub("c\\(|[, ]", "", one_row$helplinks)
   links <- scan(text = stripped, what = "")
   data.frame(caseid = rep(one_row$caseid, length(links)), alias = links)
 })
#Read 4 items
#Read 6 items

现在我们将数据帧绑定在一起:

 # Row-bind every data frame in the `expanded` list into one data frame.
 do.call(rbind, expanded)
#---------------
     caseid   alias
2.1 7758128   60107
2.2 7758128   56085
2.3 7758128   57587
2.4 7758128 3000020
5.1 8829620   60107
5.2 8829620 3000023
5.3 8829620 3000020
5.4 8829620   60107
5.5 8829620   56085
5.6 8829620   57587

但我发现这只完成了一半。既然 Ananda 的答案已经在那里了,就没有必要再继续做下去。