转换复杂的数据框

时间:2017-08-18 16:09:12

标签: r dataframe

我有一个数据框,其第一列(weights)包含一个列表(由数据框组成的列表?):

> head(data$weights)

> data <- structure(list(A373R11 = structure(list(Signature.1A = 0, Signature.1B = 0, 
    Signature.2 = 0, Signature.3 = 0.151631702143023, Signature.4 = 0.149799882118262, 
    Signature.5 = 0, Signature.6 = 0, Signature.7 = 0.0634912587993959, 
    Signature.8 = 0, Signature.9 = 0.173189155080817, Signature.10 = 0, 
    Signature.11 = 0, Signature.12 = 0, Signature.13 = 0, Signature.14 = 0, 
    Signature.15 = 0, Signature.16 = 0, Signature.17 = 0, Signature.18 = 0, 
    Signature.19 = 0, Signature.20 = 0, Signature.21 = 0.0905517653558877, 
    Signature.R1 = 0, Signature.R2 = 0, Signature.R3 = 0, Signature.U1 = 0.155590748898003, 
    Signature.U2 = 0.145955461287919), .Names = c("Signature.1A", 
"Signature.1B", "Signature.2", "Signature.3", "Signature.4", 
"Signature.5", "Signature.6", "Signature.7", "Signature.8", "Signature.9", 
"Signature.10", "Signature.11", "Signature.12", "Signature.13", 
"Signature.14", "Signature.15", "Signature.16", "Signature.17", 
"Signature.18", "Signature.19", "Signature.20", "Signature.21", 
"Signature.R1", "Signature.R2", "Signature.R3", "Signature.U1", 
"Signature.U2"), row.names = "A373R11", class = "data.frame"), 
    A373R13 = structure(list(Signature.1A = 0, Signature.1B = 0, 
        Signature.2 = 0, Signature.3 = 0.221014874027829, Signature.4 = 0, 
        Signature.5 = 0, Signature.6 = 0, Signature.7 = 0, Signature.8 = 0.279252211893692, 
        Signature.9 = 0, Signature.10 = 0, Signature.11 = 0, 
        Signature.12 = 0, Signature.13 = 0, Signature.14 = 0, 
        Signature.15 = 0, Signature.16 = 0, Signature.17 = 0, 
        Signature.18 = 0, Signature.19 = 0.115216422668955, Signature.20 = 0, 
        Signature.21 = 0, Signature.R1 = 0, Signature.R2 = 0, 
        Signature.R3 = 0.0636987713225648, Signature.U1 = 0.108875099907467, 
        Signature.U2 = 0), .Names = c("Signature.1A", "Signature.1B", 
    "Signature.2", "Signature.3", "Signature.4", "Signature.5", 
    "Signature.6", "Signature.7", "Signature.8", "Signature.9", 
    "Signature.10", "Signature.11", "Signature.12", "Signature.13", 
    "Signature.14", "Signature.15", "Signature.16", "Signature.17", 
    "Signature.18", "Signature.19", "Signature.20", "Signature.21", 
    "Signature.R1", "Signature.R2", "Signature.R3", "Signature.U1", 
    "Signature.U2"), row.names = "A373R13", class = "data.frame"), 
    A373R3 = structure(list(Signature.1A = 0, Signature.1B = 0, 
        Signature.2 = 0, Signature.3 = 0.0795605471131758, Signature.4 = 0.0973130562439999, 
        Signature.5 = 0, Signature.6 = 0, Signature.7 = 0, Signature.8 = 0.249674548796242, 
        Signature.9 = 0.0725013504411567, Signature.10 = 0, Signature.11 = 0.064665155855146, 
        Signature.12 = 0, Signature.13 = 0, Signature.14 = 0, 
        Signature.15 = 0, Signature.16 = 0, Signature.17 = 0, 
        Signature.18 = 0, Signature.19 = 0, Signature.20 = 0, 
        Signature.21 = 0, Signature.R1 = 0, Signature.R2 = 0, 
        Signature.R3 = 0.0703546703126821, Signature.U1 = 0.21753544296676, 
        Signature.U2 = 0.0739201832004727), .Names = c("Signature.1A", 
    "Signature.1B", "Signature.2", "Signature.3", "Signature.4", 
    "Signature.5", "Signature.6", "Signature.7", "Signature.8", 
    "Signature.9", "Signature.10", "Signature.11", "Signature.12", 
    "Signature.13", "Signature.14", "Signature.15", "Signature.16", 
    "Signature.17", "Signature.18", "Signature.19", "Signature.20", 
    "Signature.21", "Signature.R1", "Signature.R2", "Signature.R3", 
    "Signature.U1", "Signature.U2"), row.names = "A373R3", class = "data.frame"), 
    A373R5 = structure(list(Signature.1A = 0, Signature.1B = 0, 
        Signature.2 = 0, Signature.3 = 0.113996509522102, Signature.4 = 0.114874220936966, 
        Signature.5 = 0.142056872670519, Signature.6 = 0, Signature.7 = 0, 
        Signature.8 = 0.208376707959741, Signature.9 = 0.0744527503782136, 
        Signature.10 = 0, Signature.11 = 0, Signature.12 = 0, 
        Signature.13 = 0, Signature.14 = 0, Signature.15 = 0.0771902641012979, 
        Signature.16 = 0, Signature.17 = 0, Signature.18 = 0, 
        Signature.19 = 0, Signature.20 = 0, Signature.21 = 0, 
        Signature.R1 = 0, Signature.R2 = 0, Signature.R3 = 0, 
        Signature.U1 = 0.0673567355607731, Signature.U2 = 0), .Names = c("Signature.1A", 
    "Signature.1B", "Signature.2", "Signature.3", "Signature.4", 
    "Signature.5", "Signature.6", "Signature.7", "Signature.8", 
    "Signature.9", "Signature.10", "Signature.11", "Signature.12", 
    "Signature.13", "Signature.14", "Signature.15", "Signature.16", 
    "Signature.17", "Signature.18", "Signature.19", "Signature.20", 
    "Signature.21", "Signature.R1", "Signature.R2", "Signature.R3", 
    "Signature.U1", "Signature.U2"), row.names = "A373R5", class = "data.frame"), 
    A373R9 = structure(list(Signature.1A = 0, Signature.1B = 0, 
        Signature.2 = 0, Signature.3 = 0.116847300193985, Signature.4 = 0, 
        Signature.5 = 0.21624751052703, Signature.6 = 0, Signature.7 = 0, 
        Signature.8 = 0.252498230882402, Signature.9 = 0, Signature.10 = 0, 
        Signature.11 = 0.119495912880994, Signature.12 = 0, Signature.13 = 0, 
        Signature.14 = 0, Signature.15 = 0, Signature.16 = 0, 
        Signature.17 = 0, Signature.18 = 0, Signature.19 = 0, 
        Signature.20 = 0, Signature.21 = 0, Signature.R1 = 0, 
        Signature.R2 = 0, Signature.R3 = 0.0725549911220892, 
        Signature.U1 = 0, Signature.U2 = 0), .Names = c("Signature.1A", 
    "Signature.1B", "Signature.2", "Signature.3", "Signature.4", 
    "Signature.5", "Signature.6", "Signature.7", "Signature.8", 
    "Signature.9", "Signature.10", "Signature.11", "Signature.12", 
    "Signature.13", "Signature.14", "Signature.15", "Signature.16", 
    "Signature.17", "Signature.18", "Signature.19", "Signature.20", 
    "Signature.21", "Signature.R1", "Signature.R2", "Signature.R3", 
    "Signature.U1", "Signature.U2"), row.names = "A373R9", class = "data.frame"), 
    A512R19 = structure(list(Signature.1A = 0.109490572493859, 
        Signature.1B = 0, Signature.2 = 0, Signature.3 = 0, Signature.4 = 0.22010156823306, 
        Signature.5 = 0, Signature.6 = 0, Signature.7 = 0, Signature.8 = 0, 
        Signature.9 = 0, Signature.10 = 0, Signature.11 = 0, 
        Signature.12 = 0, Signature.13 = 0, Signature.14 = 0, 
        Signature.15 = 0, Signature.16 = 0, Signature.17 = 0, 
        Signature.18 = 0, Signature.19 = 0, Signature.20 = 0, 
        Signature.21 = 0, Signature.R1 = 0, Signature.R2 = 0, 
        Signature.R3 = 0.150943894106973, Signature.U1 = 0.248556502648564, 
        Signature.U2 = 0.119306892617062), .Names = c("Signature.1A", 
    "Signature.1B", "Signature.2", "Signature.3", "Signature.4", 
    "Signature.5", "Signature.6", "Signature.7", "Signature.8", 
    "Signature.9", "Signature.10", "Signature.11", "Signature.12", 
    "Signature.13", "Signature.14", "Signature.15", "Signature.16", 
    "Signature.17", "Signature.18", "Signature.19", "Signature.20", 
    "Signature.21", "Signature.R1", "Signature.R2", "Signature.R3", 
    "Signature.U1", "Signature.U2"), row.names = "A512R19", class = "data.frame")), .Names = c("A373R11", 
"A373R13", "A373R3", "A373R5", "A373R9", "A512R19"))

此处,每行包含一个样本,每列包含特定签名的分数:

> data[1]

$A373R11
        Signature.1A Signature.1B Signature.2 Signature.3 Signature.4 Signature.5 Signature.6 Signature.7 Signature.8 Signature.9 Signature.10 Signature.11
A373R11            0            0           0   0.1516317   0.1497999           0           0  0.06349126           0   0.1731892            0            0
        Signature.12 Signature.13 Signature.14 Signature.15 Signature.16 Signature.17 Signature.18 Signature.19 Signature.20 Signature.21 Signature.R1 Signature.R2
A373R11            0            0            0            0            0            0            0            0            0   0.09055177            0            0
        Signature.R3 Signature.U1 Signature.U2
A373R11            0    0.1555907    0.1459555

我想将其转换为具有以下结构的数据框:

sample  signature  score
A373R11  Signature.1A  0
A373R11  Signature.1B  0
[...]
A373R13  Signature.1A  0
A373R13  Signature.1B  0
[...]

有人能指出我正确的方向吗?

4 个答案:

答案 0 :(得分:9)

两种方法:

1)使用data.table-package

使用:

library(data.table)
melt(rbindlist(data, idcol = 'sample'),
     id = 'sample', variable.name = 'signature', value.name = 'score')

给出:

      sample    signature      score
  1: A373R11 Signature.1A 0.00000000
  2: A373R13 Signature.1A 0.00000000
  3:  A373R3 Signature.1A 0.00000000
  4:  A373R5 Signature.1A 0.00000000
  5:  A373R9 Signature.1A 0.00000000
 ---                                
158: A373R13 Signature.U2 0.00000000
159:  A373R3 Signature.U2 0.07392018
160:  A373R5 Signature.U2 0.00000000
161:  A373R9 Signature.U2 0.00000000
162: A512R19 Signature.U2 0.11930689

2)基础R

使用:

dat2 <- do.call(rbind, dat)
reshape(dat2, idvar = 'sample', ids = row.names(dat2),
        varying = list(1:ncol(dat2)), times = colnames(dat2),
        timevar = 'signature', v.names = 'score',
        new.row.names = NULL, direction = 'long')

给出:

                        signature      score  sample
A373R11.Signature.1A Signature.1A 0.00000000 A373R11
A373R13.Signature.1A Signature.1A 0.00000000 A373R13
A373R3.Signature.1A  Signature.1A 0.00000000  A373R3
A373R5.Signature.1A  Signature.1A 0.00000000  A373R5
A373R9.Signature.1A  Signature.1A 0.00000000  A373R9

.....

A373R13.Signature.U2 Signature.U2 0.00000000 A373R13
A373R3.Signature.U2  Signature.U2 0.07392018  A373R3
A373R5.Signature.U2  Signature.U2 0.00000000  A373R5
A373R9.Signature.U2  Signature.U2 0.00000000  A373R9
A512R19.Signature.U2 Signature.U2 0.11930689 A512R19

注:

最好不要为数据提供与函数相同的名称(data本身就是一个R函数);基础R示例中使用的dat即为问题中的data对象。请参阅?data

答案 1 :(得分:7)

一种tidyverse解决方案,我们首先将所有data.frames连接在一起,然后使用gather根据需要重新整形它们:

gather

使用:

library(dplyr)
library(tidyr)

data %>%
  bind_rows(.id = 'sample') %>%
  gather(signature, score, -sample)

可以写成没有管道的单行代码:gather(bind_rows(data, .id = 'sample'), signature, score, -sample)。两种写法都给出:

     sample    signature      score
1   A373R11 Signature.1A 0.00000000
2   A373R13 Signature.1A 0.00000000
3    A373R3 Signature.1A 0.00000000
4    A373R5 Signature.1A 0.00000000
5    A373R9 Signature.1A 0.00000000
6   A512R19 Signature.1A 0.10949057
7   A373R11 Signature.1B 0.00000000
8   A373R13 Signature.1B 0.00000000
9    A373R3 Signature.1B 0.00000000
10   A373R5 Signature.1B 0.00000000
....

答案 2 :(得分:7)

以下是使用rapply的基础R替代方案。请注意,我已将您的数据对象(一个由data.frame组成的列表)重命名为dat。

# pull out the values and their attached names with rapply
myVec <- rapply(unname(dat), identity)
# even better:
# myVec <- unlist(unname(dat))

# construct the data.frame
mydf <- data.frame(sample=rep(names(dat), lengths(dat)),
                   signature=names(myVec),
                   score=myVec,
                   stringsAsFactors=FALSE, row.names = seq_along(myVec))

返回

 head(mydf)
   sample    signature     score
1 A373R11 Signature.1A 0.0000000
2 A373R11 Signature.1B 0.0000000
3 A373R11  Signature.2 0.0000000
4 A373R11  Signature.3 0.1516317
5 A373R11  Signature.4 0.1497999
6 A373R11  Signature.5 0.0000000

默认情况下,rapply对列表中的每个叶元素(最内层的非列表元素)执行一个函数,并返回一个向量。我使用identity原样返回这些元素。由于每个元素都有一个与之关联的名称,rapply将返回一个命名向量。

我使用unname去除每个外部列表项的名称。这使得下一步的数据构建变得更加容易。否则,命名向量将具有类似“A373R11.Signature.5”的名称,这将需要更多的工作来返回所需的结果。

答案 3 :(得分:4)

首先,您将data.frames(使用rbind)组合在一起,然后将它们转换为矩阵以保留rownames并将其融合(melt)。

rbind

带管道的清洁解决方案

library(data.table)    
res <- melt(as.matrix(do.call(rbind, data)))
colnames(res) <- c("sample", "signature", "score")