Question

我有一个大型数据库，如下所示：

      >tms
    expId     id     date sessionNr waveSh ipi isi perc    ampl qual eventNr
1   b80M1 myrthe 20131206         1      2  20   1   80  416.10    1     145
2   b80M1 myrthe 20131206         1      2   4   2   80  366.80    1     146
3   b80M1 myrthe 20131206         1      2   4   3   80  411.60    1     147
    ..... ...... ........         .      .   .   .   ..  ......    .     ... 
    ..... ...... ........         .      .   .   .   ..  ......    .     ...
24  m80M1 myrthe 20131218         1      1  20   2   80   58.10    1     266
25  m80M1 myrthe 20131218         1      1   4   1   80   22.60    0     267
26  m80M1 myrthe 20131218         1      1   4   3   80   21.90    0     268
    ..... ...... ........         .      .   .   .   ..  ......    .     ...
    ..... ...... ........         .      .   .   .   ..  ......    .     ...
201 h80M1 myrthe 20131219         1      3   5   3   80   33.00    0     194
202 h80M1 myrthe 20131219         1      3   6   1   80   52.50    1     195
203 h80M1 myrthe 20131219         1      3   4   4   80  314.20    1     196

在每个 tms$expId 分组内，我想创建一个名为 tms$norm 的新变量。该变量表示 tms$ampl 与同一 tms$expId 中 tms$isi == 1 的那些行的 tms$ampl 平均值之间的比率，即 tms$ampl / mean(tms[tms$isi == 1, ]$ampl)。

我可以用下面这种冗长的方式来做，为每个 tms$expId 手动取子集：

    # Manually normalise each experiment: take the rows of one expId from tms,
    # then divide every amplitude by the mean amplitude of that subset's
    # isi == 1 rows. The expId values are strings, so they must be quoted.
    # attach()/detach() are not needed because every column is reached through
    # its own data frame.
    b80L1 <- tms[tms$expId == "b80L1", ]
    b80L1$norm <- b80L1$ampl / mean(b80L1$ampl[b80L1$isi == 1])

    m80M1 <- tms[tms$expId == "m80M1", ]
    m80M1$norm <- m80M1$ampl / mean(m80M1$ampl[m80M1$isi == 1])

    # Subset from tms (not from h80M1 itself, which does not exist yet).
    h80M1 <- tms[tms$expId == "h80M1", ]
    h80M1$norm <- h80M1$ampl / mean(h80M1$ampl[h80M1$isi == 1])

然后在一个数据框中再次组合所有子集，如下所示：

    # Stack the three per-experiment subsets back into a single data frame.
    tmsNorm <- do.call(rbind, list(b80L1, m80M1, h80M1))

然后tmsNorm数据库看起来像这样：

      >tmsNorm
    expId     id     date sessionNr waveSh ipi isi perc    ampl qual eventNr  norm
1   b80M1 myrthe 20131206         1      2  20   1   80  416.10    1     145  0.6547
2   b80M1 myrthe 20131206         1      2   4   2   80  366.80    1     146  0.5667
3   b80M1 myrthe 20131206         1      2   4   3   80  411.60    1     147  0.6530
    ..... ...... ........         .      .   .   .   ..  ......    .     ...  ...
    ..... ...... ........         .      .   .   .   ..  ......    .     ...  ...
24  m80M1 myrthe 20131218         1      1  20   2   80   58.10    1     266  0.0123
25  m80M1 myrthe 20131218         1      1   4   1   80   22.60    0     267  0.0056
26  m80M1 myrthe 20131218         1      1   4   3   80   21.90    0     268  0.0057
    ..... ...... ........         .      .   .   .   ..  ......    .     ...  ...
    ..... ...... ........         .      .   .   .   ..  ......    .     ...  ...
201 h80M1 myrthe 20131219         1      3   5   3   80   33.00    0     194  0.0045
202 h80M1 myrthe 20131219         1      3   6   1   80   52.50    1     195  0.0053
203 h80M1 myrthe 20131219         1      3   4   4   80  314.20    1     196  0.0145

但是，由于我有大约 100 种 tms$expId，我真的很想用循环或某种 apply 函数来创建这个 tms$norm 变量。

我尝试了下面的代码，它无法正常工作，但希望能说明我想要实现的目标：

    # NOTE(review): this is the asker's non-working attempt, kept exactly as
    # posted so that it still matches the question text.
    # Collect the distinct experiment ids.
    uniq <- unique(unlist(tms$expId))
    # NOTE(review): the leading `>` below looks like a pasted console prompt
    # rather than part of the code.
   > for(i in 1:length(uniq)){
       attach(tms[tms$expId==uniq[i], ])
       # Each iteration assigns to the whole tms$normReal2 column, although the
       # numerator only holds the rows of the current expId. The denominator
       # mask joins its two conditions with `|` (either one holds), not `&`
       # (both hold).
       # NOTE(review): `realAmpl` is not among the columns printed in the
       # question (which shows `ampl`) -- presumably a column of the real
       # data; confirm.
       tms$normReal2 <- tms[tms$expId==uniq[i], ]$realAmpl/mean(tms[(tms$expId==uniq[i]) |       (tms$isi==1),]$realAmpl)
       detach(tms[tms$expId==uniq[i], ])
     }

所以我的问题是：如何用非常紧凑的代码创建这个 tms$norm 变量？

提前非常感谢你！

Answer 1

尝试使用dplyr。

# Install dplyr only when it is missing, then load it. library() stops with an
# error if the package cannot be loaded, whereas require() only returns FALSE.
if (!requireNamespace("dplyr", quietly = TRUE)) {
  install.packages("dplyr")
}
library(dplyr)

# Within each expId, divide every amplitude by the mean amplitude of that
# experiment's isi == 1 rows.
tms <- group_by(tms, expId)
tms <- mutate(tms, norm = ampl / mean(ampl[isi == 1]))
# Drop the grouping so later operations on tms are not silently done per expId.
tms <- ungroup(tms)

Answer 2

library(data.table)

# Reproducible example data: 1e6 rows, expId cycling through the 26 letters
# and isi cycling through 1, 2, 3.
set.seed(123)
n_rows <- 10e5
# rep_len() spells out the cycling explicitly. Neither 26 nor 3 divides 1e6,
# so relying on implicit recycling inside data.table() is not safe.
tms <- data.table(row_id = seq_len(n_rows),
                  expId = rep_len(letters, n_rows),
                  isi = rep_len(c(1, 2, 3), n_rows),
                  ampl = rnorm(n_rows, 300, 100)
)

修改

# Keep the original table
# The mean of the ampl column when isi==1 by expId. Because the assignment is
# restricted to rows with isi == 1, every other row gets NA in isi1_mean.
tms[isi == 1, isi1_mean := mean(ampl), by = expId]
# norm is likewise only filled in for the isi == 1 rows.
# NOTE(review): the question asks for norm on every row; that would be
#   tms[, norm := ampl / mean(ampl[isi == 1]), by = expId]
tms[isi == 1, norm := (ampl / isi1_mean), by = expId]

# Keep a subset where isi==1
# Rebuild the example table first; rep_len() makes the cycling of the short
# vectors explicit (neither 26 nor 3 divides 1e6).
tms <- data.table(row_id = 1:10e5,
                  expId = rep_len(letters, 10e5),
                  isi = rep_len(c(1, 2, 3), 10e5),
                  ampl = rnorm(10e5, 300, 100)
)
# The mean of the ampl column when isi==1 by expId
tms[isi == 1, isi1_mean := mean(ampl), by = expId]
# Compute norm on all rows, then return only the rows with isi == 1.
tms[, norm := (ampl / isi1_mean), by = expId][isi == 1]

**基准测试**

我分别用 data.table 和 data.frame 作为输入测试了 dplyr 版本。我认为这样做是正确的，但如果有不对的地方，请编辑这个基准测试。

library(dplyr)
library(data.table)

# Benchmark data: 1e7 rows, once as a data.table and once as a data.frame.
set.seed(123)
# rep_len() makes the cycling of the short vectors explicit (neither 26 nor 3
# divides 1e7).
tms_dt <- data.table(row_id = 1:10e6,
                     expId = rep_len(letters, 10e6),
                     isi = rep_len(c(1, 2, 3), 10e6),
                     ampl = rnorm(10e6, 300, 100)
)
tms_df <- as.data.frame(tms_dt)

# dplyr version, benchmarked on the data.table input.
dt_dplyr <- function(data) {
  require(dplyr)
  data <- group_by(data, expId)
  data <- mutate(data, norm = ampl / mean(ampl[isi == 1]))
}

# Same dplyr code, benchmarked on the data.frame input.
df_dplyr <- function(data) {
  require(dplyr)
  data <- group_by(data, expId)
  data <- mutate(data, norm = ampl / mean(ampl[isi == 1]))
}

# data.table version; := adds the columns to `data` by reference.
dt_datatable <- function(data) {
  data[isi == 1, isi1_mean := mean(ampl), by = expId]
  data[isi == 1, norm := (ampl / isi1_mean), by = expId]
}

library(rbenchmark)
benchmark(dt_dplyr(tms_dt),
          df_dplyr(tms_df),
          dt_datatable(tms_dt))
# Output as reported by the answer's author:
#                   test replications elapsed relative user.self sys.self
# 2     df_dplyr(tms_df)          100  135.08    1.447    108.81    20.94
# 3 dt_datatable(tms_dt)          100   93.36    1.000     76.15    16.63
# 1     dt_dplyr(tms_dt)          100  275.28    2.949    105.34    72.63

Answer 3

强烈建议花时间阅读这篇论文：http://www.jstatsoft.org/v40/i01/paper （Split-Apply-Combine，即“拆分-应用-合并”策略）

在这种情况下，我认为你想要

# Install plyr only when it is missing; the package name must be a quoted
# string. library() stops with an error if the package cannot be loaded.
if (!requireNamespace("plyr", quietly = TRUE)) {
  install.packages("plyr")
}
library(plyr)

# Split tms by expId; for each piece x, append norm = ampl divided by the mean
# ampl of the rows with isi == 1, then bind the pieces back together.
ddply(tms, .(expId), function(x) {
  # Take the mean of the numeric ampl vector. mean() on a one-column data
  # frame (x[rows, ]["ampl"]) returns NA with a warning.
  data.frame(x, norm = x$ampl / mean(x$ampl[x$isi == 1]))
})

如果这个答案中有拼写错误，请见谅——我没办法亲自测试。

计算一个大型数据库的一系列子集中的新变量

3 个答案: