
时间:2016-04-11 22:54:36

标签: r function loops dplyr rbind



首先,我想 (1) 按另一个变量(在我的例子中是 speaker)分组,对变量进行标准化;以及 (2) 在变量标准化之后,过滤掉偏离平均值超过 2 个标准差的值。(1) 和 (2) 都可以通过 dplyr 的函数来处理。




df = data.frame(speaker=c("eng1","eng1","eng1","eng1","eng1","eng1","eng2","eng2","eng2","eng2","eng2"),


     speaker ratio_means001 ratio_means002
1     eng1          0.560          0.660
2     eng1          0.202          0.203
3     eng1          0.695          0.943
4     eng1          0.436          0.432
5     eng1          0.342          0.345
6     eng1         10.100          0.439
7     eng2          0.257          0.154
8     eng2          0.123          0.234
9     eng2          0.432             NA
10    eng2          0.496          0.932
11    eng2          0.832          0.854


# Z-score ratio_means001 within each speaker, then keep only the rows whose
# absolute z-score does not exceed 2. scale() returns a one-column matrix, so
# as.numeric() flattens it back to a plain numeric vector.
standardized_data <- df %>%
  group_by(speaker) %>%
  mutate(zRatio1 = as.numeric(scale(ratio_means001))) %>%
  filter(abs(zRatio1) <= 2)


     speaker ratio_means001 ratio_means002   zRatio1
     (fctr)          (dbl)          (dbl)     (dbl)
 1     eng1          0.560          0.660 -0.3792191
 2     eng1          0.202          0.203 -0.4699781
 3     eng1          0.695          0.943 -0.3449943
 4     eng1          0.436          0.432 -0.4106552
 5     eng1          0.342          0.345 -0.4344858
 6     eng2          0.257          0.154 -0.6349445
 7     eng2          0.123          0.234 -1.1325034
 8     eng2          0.432             NA  0.0148525
 9     eng2          0.496          0.932  0.2524926
 10    eng2          0.832          0.854  1.5001028

这是我目前写出的函数。mutate 部分可以正常工作,但我一直没能把 filter 部分加进去:

# Question's attempt at a helper: z-score column `col1` (given as a string)
# within each speaker, store the result under `new_col_name` (string), then
# filter on the new column.
#
# NOTE: apart from the restored closing brace, this version is intentionally
# left as asked, because the error message that follows it refers to this code.
# The outer lazyeval::interp() call receives only the named arguments `b` and
# `a`, so its first argument (`_obj`) is never supplied. The function also
# reads a global called `data` (presumably `df` was intended).
standardize_variable = function(col1, new_col_name) {
     mutate_call = lazyeval::interp(b = interp(~ scale(a)), a = as.name(col1))
     group_by(data,speaker) %>% 
     mutate_(.dots = setNames(list(mutate_call), new_col_name)) %>%
     filter_(interp(~ !abs(b) > 2.5, b = as.name(new_col_name))) # this part does not work
}


data = standardize_variable("ratio_means001","zRatio1")

Error in substitute_(`_obj`[[2]], values) : 
argument "_obj" is missing, with no default



# Question's attempt to run the helper for both ratio columns and stack the
# results. The braces make all three statements part of the loop body.
#
# NOTE: intentionally left failing, because the error shown after this snippet
# refers to it. rbind() on data frames requires matching column names, but
# each pass adds a differently named column (zRatio1, then zRatio2), which
# produces "names do not match previous names".
d <- data.frame()
for (i in 1:2) {
 col1 <- paste("ratio_means00", i, sep = "")
 new_col <- paste("zRatio", i, sep = "")
 d <- rbind(d, standardize_variable(col1, new_col))
}


 Error in match.names(clabs, names(xi)) : 
 names do not match previous names 


1 个答案:

答案 0 :(得分:0)


我认为你的函数遇到的主要问题在于调用了两次 interp。修复这一点之后,filter 又出现了另一个问题,我认为这是因为 scale 给结果添加了属性(我使用的是开发版 dplyr,dplyr_0.4.3.9001)。用 as.numeric 把 scale 包起来就可以去掉这些属性。


# Standardize one column within each speaker and drop outlying rows.
#
# Arguments:
#   col1          Name (string) of the column to z-score.
#   new_col_name  Name (string) of the z-score column to create.
#   data          Data frame containing a `speaker` column and `col1`.
#                 Defaults to the global `df`, so existing two-argument calls
#                 behave as before.
#
# Returns `data` grouped by speaker with `new_col_name` added, restricted to
# rows whose absolute z-score does not exceed 2.
standardize_variable <- function(col1, new_col_name, data = df) {
    # scale() returns a matrix with attributes; as.numeric() strips them so
    # that the later filter step receives a plain numeric column.
    mutate_call <- lazyeval::interp(~as.numeric(scale(a)), a = as.name(col1))
    filter_call <- lazyeval::interp(~ !abs(b) > 2, b = as.name(new_col_name))

    group_by(data, speaker) %>%
        mutate_(.dots = setNames(list(mutate_call), new_col_name)) %>%
        filter_(filter_call)
}


# Run the helper once per ratio column, collecting each filtered result in a
# preallocated list.
d <- vector("list", 2)
for (i in seq_along(d)) {
    col1 <- paste0("ratio_means00", i)
    new_col <- paste0("zRatio", i)
    d[[i]] <- standardize_variable(col1, new_col)
}

# Merge the per-column results on their shared columns. Reduce() merges
# pairwise, so it also works for more than two results; do.call(merge, d)
# would pass a third list element as merge()'s `by` argument.
Reduce(merge, d)

  speaker ratio_means001 ratio_means002    zRatio1    zRatio2
1    eng1          0.202          0.203 -0.4699781 -1.1490444
2    eng1          0.342          0.345 -0.4344858 -0.6063693
3    eng1          0.436          0.432 -0.4106552 -0.2738853
4    eng1          0.560          0.660 -0.3792191  0.5974521
5    eng1          0.695          0.943 -0.3449943  1.6789806
6    eng2          0.123          0.234 -1.1325034 -0.7620572
7    eng2          0.257          0.154 -0.6349445 -0.9590348
8    eng2          0.496          0.932  0.2524926  0.9565726
9    eng2          0.832          0.854  1.5001028  0.7645194



在下面的代码中,我利用了 mutate_each 从 dplyr_0.4.3.9001 开始允许为单个函数命名这一特性。rename_ 部分看起来有点复杂,因为我是在为新列生成您想要的名称。为了简化,您可以让这些列保留 mutate_each 生成的 _z 结尾,从而省去在 rename_ 中使用 gsub 和 grepl 的复杂步骤。

# Z-score every ratio_means00* column within speaker (the new columns get a
# "_z" suffix), then rename each one, e.g. ratio_means001_z becomes zRatio1.
df2 <- df %>%
    group_by(speaker) %>%
    mutate_each(funs(z = as.numeric(scale(.))), starts_with("ratio_means00")) %>%
    rename_(.dots = setNames(
        grep("z", names(.), value = TRUE),
        paste0("zR", gsub("r|_z|_means00", "", grep("z", names(.), value = TRUE)))
    ))




得到 df2 之后,只需按多个 z 列进行过滤:

dots = lapply(names(df2)[starts_with("z", vars = names(df2))],
              function(y) interp(~abs(x) < 2, x = as.name(y)))
filter_(df2, .dots = dots)

Source: local data frame [9 x 5]
Groups: speaker [2]

  speaker ratio_means001 ratio_means002    zRatio1    zRatio2
   (fctr)          (dbl)          (dbl)      (dbl)      (dbl)
1    eng1          0.560          0.660 -0.3792191  0.5974521
2    eng1          0.202          0.203 -0.4699781 -1.1490444
3    eng1          0.695          0.943 -0.3449943  1.6789806
4    eng1          0.436          0.432 -0.4106552 -0.2738853
5    eng1          0.342          0.345 -0.4344858 -0.6063693
6    eng2          0.257          0.154 -0.6349445 -0.9590348
7    eng2          0.123          0.234 -1.1325034 -0.7620572
8    eng2          0.496          0.932  0.2524926  0.9565726
9    eng2          0.832          0.854  1.5001028  0.7645194

我经常发现,如果重塑数据集而不是跨列操作,这类问题处理起来最为直接。例如,仍然使用最新版本的 mutate_each,但为了简化而跳过重命名步骤,您可以使用 tidyr 中的 gather 函数将所有标准化列聚合到一起,然后对新列进行 filter。


library(tidyr)
df %>%
    group_by(speaker) %>%
    mutate_each(funs(z = as.numeric(scale(.))), starts_with("ratio_means00")) %>%
    gather(group, zval, ends_with("_z")) %>%
    filter(abs(zval) <2 )

# First 12 lines of output
Source: local data frame [20 x 5]
Groups: speaker [2]

   speaker ratio_means001 ratio_means002            group       zval
    <fctr>          <dbl>          <dbl>            <chr>      <dbl>
1     eng1          0.560          0.660 ratio_means001_z -0.3792191
2     eng1          0.202          0.203 ratio_means001_z -0.4699781
3     eng1          0.695          0.943 ratio_means001_z -0.3449943
4     eng1          0.436          0.432 ratio_means001_z -0.4106552
5     eng1          0.342          0.345 ratio_means001_z -0.4344858
6     eng2          0.257          0.154 ratio_means001_z -0.6349445
7     eng2          0.123          0.234 ratio_means001_z -1.1325034
8     eng2          0.432             NA ratio_means001_z  0.0148525
9     eng2          0.496          0.932 ratio_means001_z  0.2524926
10    eng2          0.832          0.854 ratio_means001_z  1.5001028
11    eng1          0.560          0.660 ratio_means002_z  0.5974521
12    eng1          0.202          0.203 ratio_means002_z -1.1490444
...

如果所需的最终形式是宽格式,您可以使用 spread(也来自 tidyr)。对我而言,这样做的一个优点是:即使另一个变量未通过过滤步骤,您仍然可以保留某个变量的所有值。


df %>%
    group_by(speaker) %>%
    mutate_each(funs(z = as.numeric(scale(.))), starts_with("ratio_means00")) %>%
    gather(group, zval, ends_with("_z")) %>%
    filter(abs(zval) <2 ) %>%
    spread(group, zval)

Source: local data frame [11 x 5]
Groups: speaker [2]

   speaker ratio_means001 ratio_means002 ratio_means001_z ratio_means002_z
    <fctr>          <dbl>          <dbl>            <dbl>            <dbl>
1     eng1          0.202          0.203       -0.4699781       -1.1490444
2     eng1          0.342          0.345       -0.4344858       -0.6063693
3     eng1          0.436          0.432       -0.4106552       -0.2738853
4     eng1          0.560          0.660       -0.3792191        0.5974521
5     eng1          0.695          0.943       -0.3449943        1.6789806
6     eng1         10.100          0.439               NA       -0.2471337
7     eng2          0.123          0.234       -1.1325034       -0.7620572
8     eng2          0.257          0.154       -0.6349445       -0.9590348
9     eng2          0.432             NA        0.0148525               NA
10    eng2          0.496          0.932        0.2524926        0.9565726
11    eng2          0.832          0.854        1.5001028        0.7645194

如果您不想保留 NA,可以在以后随时将其去除(例如对数据集使用 na.omit)。