数据框,按列值拆分并放入新列

时间:2014-10-27 00:56:57

标签: r split dataframe apply

我是R的新手,目前正在设置我的数据。 我的数据采用的格式是每行包含一个单独的测量值(DV),以及一个包含测量类型(DVID)说明的列。

以下是我的数据示例:

ID  TIME    DV  DVID
1   0   0.0 7
1   1   27.5    1
1   1   0.0 7
1   4   19.6    1
1   4   0.0 7
1   8   17.9    1
1   8   0.0 7
1   12  17.7    1
1   12  0.0 7
1   24  19.6    1
1   24  0.0 7
1   48  32.9    1
1   48  0.0 7
2   0   0.0 7
2   1   0.0 7
2   4   0.0 7
2   8   0.0 7
2   12  0.0 7
2   24  0.0 7
2   48  27.3    1
2   72  30.9    1
2   72  0.0 7
2   96  20.8    1
3   0   1.0 7
3   1   7.0 1
3   1   0.0 7
3   4   15.0    1
3   4   0.0 7
3   8   27.2    1
3   8   0.0 7
3   12  0.0 7
3   24  47.0    1
3   24  0.0 7
3   48  65.4    1
3   48  0.0 7
3   72  68.7    1
3   72  0.0 7
3   96  82.8    1
3   96  0.0 7
3   120 70.5    1

我想做的是把不同类型的测量"配对"起来:让一列存放一种类型(DVID = 1)的测量值,另一列存放另一种类型(DVID = 7)的测量值。 我还需要删除缺少其中一种测量类型的记录(或者,在这些字段中放入NA)。 期望结果的一个例子如下:

ID  TIME    DV_1    DV_7
1   1   27.5    0
1   4   19.6    0
1   8   17.9    0
1   12  17.7    0
1   24  19.6    0
1   48  32.9    0

目的是我希望能够根据DVID = 1的值绘制DVID = 7的值。 这里有人可以帮我这么做吗? 我猜可能需要用到split和apply系列的函数,但我不知道从哪里开始。

提前致谢!

3 个答案:

答案 0 :(得分:4)

这是一种方法。

library(dplyr)
library(tidyr)

# Reshape from long to wide: one DV column per DVID value, keyed by ID and
# TIME. pivot_wider() supersedes spread(). `names_prefix` builds the DV1/DV7
# names directly instead of overwriting colnames by position, and
# `names_sort = TRUE` orders the new columns by DVID value (DV1 before DV7)
# rather than by first appearance in the data.
ana <- pivot_wider(
  foo,
  names_from = DVID,
  values_from = DV,
  names_prefix = "DV",
  names_sort = TRUE
)

# Keep only the rows where both measurement types are present.
# is.na() is the idiomatic missing-value test; comma-separated conditions in
# filter() are combined with AND.
# NOTE: pivot_wider() may return a tibble for a plain data.frame input, so
# the printed header can differ from a data.frame print; the values do not.
filter(ana, !is.na(DV1), !is.na(DV7))

#   ID TIME  DV1 DV7
#1   1    1 27.5   0
#2   1    4 19.6   0
#3   1    8 17.9   0
#4   1   12 17.7   0
#5   1   24 19.6   0
#6   1   48 32.9   0
#7   2   72 30.9   0
#8   3    1  7.0   0
#9   3    4 15.0   0
#10  3    8 27.2   0
#11  3   24 47.0   0
#12  3   48 65.4   0
#13  3   72 68.7   0
#14  3   96 82.8   0

另一种方法是将数据框转换为data.table

# Turn foo into a data.table so the data.table reshaping tools apply
setDT(foo)

# Cast to wide format: one row per ID/TIME pair, one column per DVID value
bob <- dcast.data.table(foo, ID + TIME ~ DVID, value.var = "DV")

# Rename the value columns "1" and "7" to DV1 and DV7
setnames(bob, c("1", "7"), c("DV1", "DV7"))

# Print the rows where neither DV1 nor DV7 is NA
bob[!(DV1 %in% NA | DV7 %in% NA), ]

**更新**

根据@Arun的建议,使用data.table 1.9.5时,上面的第3行可以改写为:

# Drop rows that have NA in column "1" or "7" (the names dcast produces
# before any renaming). The data.table method of na.omit() restricts the NA
# check through `cols`; a `by` argument is not part of its signature and
# would be absorbed by `...`, so every column would be checked instead.
na.omit(bob, cols = c("1", "7"))

答案 1 :(得分:2)

您似乎想要重塑数据。可以使用reshape包中的cast函数:

library(reshape)

# Read the example data (long format: one measurement per row) from an
# inline string. Passing the string via `text =` lets read.table() open and
# close its own text connection; wrapping it in textConnection() hands over
# an already-open connection that read.table() does not close.
# Columns are whitespace-separated, so the spacing within a row is free.
dfX <- read.table(text = "ID  TIME  DV  DVID
1   0   0.0   7
1   1   27.5  1
1   1   0.0   7
1   4   19.6  1
1   4   0.0   7
1   8   17.9  1
1   8   0.0   7
1   12  17.7  1
1   12  0.0   7
1   24  19.6  1
1   24  0.0   7
1   48  32.9  1
1   48  0.0   7
2   0   0.0   7
2   1   0.0   7
2   4   0.0   7
2   8   0.0   7
2   12  0.0   7
2   24  0.0   7
2   48  27.3  1
2   72  30.9  1
2   72  0.0   7
2   96  20.8  1
3   0   1.0   7
3   1   7.0   1
3   1   0.0   7
3   4   15.0  1
3   4   0.0   7
3   8   27.2  1
3   8   0.0   7
3   12  0.0   7
3   24  47.0  1
3   24  0.0   7
3   48  65.4  1
3   48  0.0   7
3   72  68.7  1
3   72  0.0   7
3   96  82.8  1
3   96  0.0   7
3   120 70.5  1", header = TRUE)

# Cast the long data to wide form: one row per ID/TIME combination and one
# column per DVID value, filled from the DV column
reshape::cast(data = dfX, formula = ID + TIME ~ DVID, value = "DV")

这是输出:

> reshape::cast(dfX, ID + TIME ~ DVID, value = "DV")
   ID TIME    1  7
1   1    0   NA  0
2   1    1 27.5  0
3   1    4 19.6  0
4   1    8 17.9  0
5   1   12 17.7  0
6   1   24 19.6  0
7   1   48 32.9  0
8   2    0   NA  0
9   2    1   NA  0
10  2    4   NA  0
11  2    8   NA  0
12  2   12   NA  0
13  2   24   NA  0
14  2   48 27.3 NA
15  2   72 30.9  0
16  2   96 20.8 NA
17  3    0   NA  1
18  3    1  7.0  0
19  3    4 15.0  0
20  3    8 27.2  0
21  3   12   NA  0
22  3   24 47.0  0
23  3   48 65.4  0
24  3   72 68.7  0
25  3   96 82.8  0
26  3  120 70.5 NA

答案 2 :(得分:1)

此外,您可以使用base R中的reshape函数:

 # Reshape to wide format (one DV.<DVID> column per measurement type), drop
 # the rows with a missing measurement, then put DV.1 before DV.7.
 # Columns are selected by name rather than by position, so the result does
 # not depend on which DVID value happens to appear first in the data.
 na.omit(
   reshape(df, idvar = c("ID", "TIME"), timevar = "DVID", direction = "wide")
 )[, c("ID", "TIME", "DV.1", "DV.7")]

 #    ID TIME DV.1 DV.7
 #2   1    1 27.5    0
 #4   1    4 19.6    0
 #6   1    8 17.9    0
 #8   1   12 17.7    0
 #10  1   24 19.6    0
 #12  1   48 32.9    0
 #21  2   72 30.9    0
 #25  3    1  7.0    0
 #27  3    4 15.0    0
 #29  3    8 27.2    0
 #32  3   24 47.0    0
 #34  3   48 65.4    0
 #36  3   72 68.7    0
 #38  3   96 82.8    0

数据

 # Example data in long format: 40 rows, one measurement (DV) per row, with
 # DVID giving the measurement type. Each column is laid out one subject
 # (ID 1, 2, 3) per line.
 df <- data.frame(
   ID = rep(1:3, times = c(13L, 10L, 17L)),
   TIME = c(
     0L, 1L, 1L, 4L, 4L, 8L, 8L, 12L, 12L, 24L, 24L, 48L, 48L,
     0L, 1L, 4L, 8L, 12L, 24L, 48L, 72L, 72L, 96L,
     0L, 1L, 1L, 4L, 4L, 8L, 8L, 12L, 24L, 24L, 48L, 48L, 72L, 72L, 96L, 96L,
     120L
   ),
   DV = c(
     0, 27.5, 0, 19.6, 0, 17.9, 0, 17.7, 0, 19.6, 0, 32.9, 0,
     0, 0, 0, 0, 0, 0, 27.3, 30.9, 0, 20.8,
     1, 7, 0, 15, 0, 27.2, 0, 0, 47, 0, 65.4, 0, 68.7, 0, 82.8, 0, 70.5
   ),
   DVID = c(
     7L, 1L, 7L, 1L, 7L, 1L, 7L, 1L, 7L, 1L, 7L, 1L, 7L,
     7L, 7L, 7L, 7L, 7L, 7L, 1L, 1L, 7L, 1L,
     7L, 1L, 7L, 1L, 7L, 1L, 7L, 7L, 1L, 7L, 1L, 7L, 1L, 7L, 1L, 7L, 1L
   )
 )