R子集基于3列的组合并排除重复组合的数据帧

时间:2015-11-11 08:03:55

标签: r data.table

我有一个看起来像的数据集:

Experiment  Sequence    Parameter   Time
Exp1    AAAA    2   10
Exp2    AAAA    2   11
Exp3    AAAA    2   12
Exp1    BBBB    2   13.1
Exp1    BBBB    3   13.2
Exp1    BBBB    4   13.3
Exp2    BBBB    2   14.1
Exp2    BBBB    3   14.2
Exp3    BBBB    2   16.3
Exp3    BBBB    3   16.4
Exp3    BBBB    4   16.5
Exp3    BBBB    5   16.6
Exp1    CCCC    2   20
Exp2    CCCC    2   22.2
Exp1    DDDD    3   22.3
Exp1    DDDD    2   22.4
Exp2    DDDD    3   25.2
Exp2    DDDD    2   25.3
Exp3    DDDD    3   27
Exp1    EEEE    2   28
Exp2    EEEE    3   29
Exp3    EEEE    4   30
Exp1    FFFF    2   33.2
Exp1    FFFF    3   33.4
Exp1    FFFF    4   33.6
Exp2    FFFF    2   35.1
Exp2    FFFF    3   35.2
Exp1    GGGG    2   40.1
Exp1    GGGG    2   40.2
Exp1    GGGG    2   40.3
Exp1    GGGG    2   42
Exp2    GGGG    2   42.3
Exp2    GGGG    2   44.3
Exp3    GGGG    2   45.3
Exp3    GGGG    2   45.4

真实数据集包含:

  • 许多实验
  • 许多序列,可以存在于一个或多个实验中
  • 每个序列可以有3-4个不同的参数

对于我的分析(当时),我首先需要根据前3列的组合来对数据帧进行子集化:实验,序列和参数。规则是:

  • 序列+参数组合应存在于数据帧的所有实验中。如果没有,他们应该被排除在外。 在示例中:BBBB + 4,BBBB + 5,CCCC + 2,...应该消失,BBBB + 2,DDDD + 3,....可以留下
  • 如果在同一实验中多次出现序列+参数组合,则也应排除这些组合。 在示例中:GGGG + 2

因此,在子集化之后,示例中的数据帧应该变为这样:

Experiment  Sequence    Parameter   Time
Exp1    AAAA    2   10
Exp2    AAAA    2   11
Exp3    AAAA    2   12

Exp1    BBBB    2   13.1
Exp2    BBBB    2   14.1
Exp3    BBBB    2   16.3

Exp1    BBBB    3   13.2
Exp2    BBBB    3   14.2
Exp3    BBBB    3   16.4

Exp1    DDDD    3   22.3
Exp2    DDDD    3   25.2
Exp3    DDDD    3   27

有人能帮助我吗? 谢谢!

Experiment <- c("Exp1", "Exp2", "Exp3", "Exp1", "Exp1", "Exp1", "Exp2", "Exp2", "Exp3", "Exp3", "Exp3", "Exp3", "Exp1", "Exp2", "Exp1", "Exp1", "Exp2", "Exp2", "Exp3", "Exp1", "Exp2", "Exp3", "Exp1", "Exp1", "Exp1", "Exp2", "Exp2", "Exp1", "Exp1", "Exp1",   "Exp1", "Exp2", "Exp2", "Exp3", "Exp3")
Sequence <- c("AAAA", "AAAA", "AAAA", "BBBB", "BBBB", "BBBB", "BBBB", "BBBB", "BBBB", "BBBB","BBBB", "BBBB", "CCCC", "CCCC", "DDDD", "DDDD", "DDDD", "DDDD", "DDDD", "EEEE", "EEEE", "EEEE", "FFFF", "FFFF", "FFFF", "FFFF", "FFFF", "GGGG", "GGGG", "GGGG", "GGGG", "GGGG", "GGGG", "GGGG", "GGGG")
Parameter <- c("2", "2", "2", "2", "3", "4", "2", "3", "2", "3", "4", "5", "2", "2", "3", "2", "3", "2", "3", "2", "3", "4", "2", "3", "4", "2", "3", "2", "2", "2", "2", "2", "2", "2", "2")
Time <- c(10.0, 11.0, 12.0, 13.1, 13.2, 13.3, 14.1, 14.2, 16.3, 16.4, 16.5, 16.6, 20.0, 22.2, 22.3, 22.4, 25.2, 25.3, 27.0, 28.0, 29.0, 30.0, 33.2, 33.4, 33.6, 35.1, 35.2, 40.1, 40.2, 40.3, 42.0, 42.3, 44.3, 45.3, 45.4)
df <- data.frame(Experiment, Sequence, Parameter, Time)

3 个答案:

答案 0 :(得分:3)

一个选项是data.table。我们将'data.frame'转换为'data.table'(setDT(df),按'序列','参数',if分组'实验'中unique元素的频率为3,我们Subset the Data.table(.SD),然后,如果nrow等于1(.N==1),我们按'Experiment','Sequence'和'Parameter'进行分组我们Subset the Data.table(.SD)。

library(data.table)
setDT(df)[, if(uniqueN(Experiment)==3) .SD, by = .(Sequence, Parameter)
        ][,if(.N ==1) .SD , by = .(Experiment,Sequence, Parameter)]
#      Experiment Sequence Parameter Time
# 1:       Exp1     AAAA         2 10.0
# 2:       Exp2     AAAA         2 11.0
# 3:       Exp3     AAAA         2 12.0
# 4:       Exp1     BBBB         2 13.1
# 5:       Exp2     BBBB         2 14.1
# 6:       Exp3     BBBB         2 16.3
# 7:       Exp1     BBBB         3 13.2
# 8:       Exp2     BBBB         3 14.2
# 9:       Exp3     BBBB         3 16.4
#10:       Exp1     DDDD         3 22.3
#11:       Exp2     DDDD         3 25.2
#12:       Exp3     DDDD         3 27.0

答案 1 :(得分:2)

这可以(我的数据框的名称是d):

e <- length(unique(d$Experiment))
a <- aggregate(Experiment ~ Sequence+Parameter, data=d, FUN=function(x) length(unique(x)))
s1 <- merge(subset(a, Experiment==e)[1:2], d)
s1$n <- ave(s1$Parameter, FUN=length, s1$Sequence, s1$Parameter, s1$Experiment)
s2 <- subset(s1, n==1)

s1满足第一个条件,s2满足第二个条件。

s2的输出是:

> s2
   Sequence Parameter Experiment Time n
1      AAAA         2       Exp1 10.0 1
2      AAAA         2       Exp3 12.0 1
3      AAAA         2       Exp2 11.0 1
4      BBBB         2       Exp3 16.3 1
5      BBBB         2       Exp2 14.1 1
6      BBBB         2       Exp1 13.1 1
7      BBBB         3       Exp1 13.2 1
8      BBBB         3       Exp2 14.2 1
9      BBBB         3       Exp3 16.4 1
10     DDDD         3       Exp1 22.3 1
11     DDDD         3       Exp2 25.2 1
12     DDDD         3       Exp3 27.0 1

以下是完整的代码:

d <- read.table(header=TRUE, text='Experiment  Sequence    Parameter   Time
Exp1    AAAA    2   10
Exp2    AAAA    2   11
Exp3    AAAA    2   12
Exp1    BBBB    2   13.1
Exp1    BBBB    3   13.2
Exp1    BBBB    4   13.3
Exp2    BBBB    2   14.1
Exp2    BBBB    3   14.2
Exp3    BBBB    2   16.3
Exp3    BBBB    3   16.4
Exp3    BBBB    4   16.5
Exp3    BBBB    5   16.6
Exp1    CCCC    2   20
Exp2    CCCC    2   22.2
Exp1    DDDD    3   22.3
Exp1    DDDD    2   22.4
Exp2    DDDD    3   25.2
Exp2    DDDD    2   25.3
Exp3    DDDD    3   27
Exp1    EEEE    2   28
Exp2    EEEE    3   29
Exp3    EEEE    4   30
Exp1    FFFF    2   33.2
Exp1    FFFF    3   33.4
Exp1    FFFF    4   33.6
Exp2    FFFF    2   35.1
Exp2    FFFF    3   35.2
Exp1    GGGG    2   40.1
Exp1    GGGG    2   40.2
Exp1    GGGG    2   40.3
Exp1    GGGG    2   42
Exp2    GGGG    2   42.3
Exp2    GGGG    2   44.3
Exp3    GGGG    2   45.3
Exp3    GGGG    2   45.4')
e <- length(unique(d$Experiment))
a <- aggregate(Experiment ~ Sequence+Parameter, data=d, FUN=function(x) length(unique(x)))
s1 <- merge(subset(a, Experiment==e)[1:2], d)
s1$n <- ave(s1$Parameter, FUN=length, s1$Sequence, s1$Parameter, s1$Experiment)
s2 <- subset(s1, n==1)

df

的版本
df <- data.frame(
Experiment= c("Exp1", "Exp2", "Exp3", "Exp1", "Exp1", "Exp1", "Exp2", "Exp2", "Exp3", "Exp3", "Exp3", "Exp3", "Exp1", "Exp2", "Exp1", "Exp1", "Exp2", "Exp2", "Exp3", "Exp1", "Exp2", "Exp3", "Exp1", "Exp1", "Exp1", "Exp2", "Exp2", "Exp1", "Exp1", "Exp1",   "Exp1", "Exp2", "Exp2", "Exp3", "Exp3"),
Sequence= c("AAAA", "AAAA", "AAAA", "BBBB", "BBBB", "BBBB", "BBBB", "BBBB", "BBBB", "BBBB","BBBB", "BBBB", "CCCC", "CCCC", "DDDD", "DDDD", "DDDD", "DDDD", "DDDD", "EEEE", "EEEE", "EEEE", "FFFF", "FFFF", "FFFF", "FFFF", "FFFF", "GGGG", "GGGG", "GGGG", "GGGG", "GGGG", "GGGG", "GGGG", "GGGG"),
Parameter= c("2", "2", "2", "2", "3", "4", "2", "3", "2", "3", "4", "5", "2", "2", "3", "2", "3", "2", "3", "2", "3", "4", "2", "3", "4", "2", "3", "2", "2", "2", "2", "2", "2", "2", "2"),
Time= c("10.0", "11.0", "12.0", "13.1", "13.2", "13.3", "14.1", "14.2", "16.3", "16.4", "16.5", "16.6", "20.0", "22.2", "22.3", "22.4", "25.2", "25.3", "27.0", "28.0","29.0", "30.0", "33.2", "33.4", "33.6", "35.1", "35.2", "40.1", "40.2", "40.3","42.0", "42.3", "44.3", "45.3", "45.4")
)
df$eins <- 1
e <- length(unique(df$Experiment))
a <- aggregate(Experiment ~ Sequence+Parameter, data=df, FUN=function(x) length(unique(x)))
s1 <- merge(subset(a, Experiment==e)[1:2], df)
s1$n <- ave(s1$eins, FUN=length, s1$Sequence, s1$Parameter, s1$Experiment)
s2 <- subset(s1, n==1)

我认为df$Time不应该是一个因素。

答案 2 :(得分:2)

require(dplyr)

# convenience variable aggregate of the two variables
df$seq.param <- with(df, paste(Sequence, Parameter))

# count number of unique experiments for each Sequence+Parameter
crosstab <- with(df, table(seq.param, Experiment))
seq.param.count <- rowSums(crosstab)

# identify those which appears for all experiment
selected <- names(seq.param.count[seq.param.count==length(unique(df$Experiment))])

dfs <- df %>%
  filter(seq.param %in% selected) %>%   # keep just all those appear in all
  group_by(Experiment, Sequence, Parameter) %>%
  summarize(count=n(), Time=Time[1]) %>%  # count how many times in each experiment
  filter(count==1) %>%   # keep just those that appears once
  select(-count)   # remove extra variable

输出(显示在示例中显示的排序中)

> dfs[order(dfs$Sequence, dfs$Parameter),]
Source: local data frame [12 x 5]
Groups: Experiment, Sequence [9]

   Experiment Sequence Parameter count   Time
       (fctr)   (fctr)    (fctr) (int) (fctr)
1        Exp1     AAAA         2     1   10.0
2        Exp2     AAAA         2     1   11.0
3        Exp3     AAAA         2     1   12.0
4        Exp1     BBBB         2     1   13.1
5        Exp2     BBBB         2     1   14.1
6        Exp3     BBBB         2     1   16.3
7        Exp1     BBBB         3     1   13.2
8        Exp2     BBBB         3     1   14.2
9        Exp3     BBBB         3     1   16.4
10       Exp1     DDDD         3     1   22.3
11       Exp2     DDDD         3     1   25.2
12       Exp3     DDDD         3     1   27.0