Question

我想在一个数据集上放两个函数，条件是特定变量的值。

first_function <- function(x) {return (x + 0)}
second_function <- function(x) {return (x + 1)}

df <- data.frame(Letters = c("A","B","B"), Numbers = 1:3)

类似：

df <- lapply(df, if(df$Letters=="A") first_function else second_function )

产生：

df_desired <- data.frame(Letters = c("A","B","B"), Numbers = c(1,3,4))

Answer 1

您可以使用dplyr和purrr进行操作。显然，这是一个基本功能，但是您应该能够根据需要进行构建：

library(dplyr)
library(purrr)
calc <- function(y, x){
  first_function <- function(x) {return (x + 0)}
  second_function <- function(x) {return (x + 1)}

  if(y == "A")
    return(first_function(x))

    return(second_function(x))
}

df <- data.frame(Letters = c("A","B","B"), Numbers = 1:3)

df %>% 
  mutate(Numbers = map2_dbl(Letters, Numbers, ~calc(.x,.y)))

  Letters Numbers
1       A       1
2       B       3
3       B       4

>(df_desired <- data.frame(Letters = c("A","B","B"), Numbers = c(1,3,4)))
      Letters Numbers
1       A       1
2       B       3
3       B       4

Benchmarking

我不是data.table专家（可以随意添加），因此未在此处合并。但是，@ R Yoda是正确的。尽管它读起来不错，将来您会发现它更易于阅读和扩展该功能，但是purrr解决方案并不是那么快。我喜欢ifelse方法，因此添加了case_when，它在处理多个功能时更易于扩展。以下是一些解决方案：

library(dplyr)
library(purrr)
library(microbenchmark)

first_function <- function(x) {return (x + 0)}
second_function <- function(x) {return (x + 1)}

calc <- function(y, x){
  if(y == "A")
    return(first_function(x))

    return(second_function(x))
}

df <- data.frame(Letters = rep(c("A","B","B"),1000), Numbers = 1:3)

basic <- function(){
  data.frame(df$Letters, apply(df, 1, function(row) {
    num <- as.numeric(row['Numbers'])
    if (row['Letters'] == 'A') first_function(num) else second_function(num)
  }))
}

dplyr_purrr <- function(){
  df %>% 
    mutate(Numbers = map2_dbl(Letters, Numbers, ~calc(.x,.y)))
}

dplyr_case_when <- function(){
  df %>% 
    mutate(Numbers = case_when(
        Letters == "A" ~ first_function(Numbers),
        TRUE ~ second_function(Numbers)))
}

map_list <- function(){
   data.frame(df$Letters, map2_dbl(df2$Letters, df2$Numbers, ~calc(.x, .y)))
}

within_mapply <- function(){
  within(df, Numbers <- mapply(Letters, Numbers, 
                               FUN = function(x, y){
    switch(x, 
           "A" = first_function(y),
           "B" = second_function(y))
    }))
}

within_ifelse <- function(){

  within(df, Numbers <- ifelse(Letters == "A",
                               first_function(Numbers),
                               second_function(Numbers)))
}

within_case_when <- function(){

  within(df, Numbers <- case_when(
    Letters == "A" ~ first_function(Numbers),
    TRUE ~ second_function(Numbers)))
}

(mbm <- microbenchmark(
  basic(),
  dplyr_purrr(),
  dplyr_case_when(),
  map_list(),
  within_mapply(),
  within_ifelse(),
  within_case_when(),
  times = 1000
))

Unit: microseconds
               expr       min         lq       mean     median        uq        max neval    cld
            basic() 12816.427 24028.3375 27719.8182 26741.7770 29417.267 277756.650  1000      f
      dplyr_purrr()  9682.884 17817.0475 20072.2752 19736.8445 21767.001  48344.265  1000     e 
  dplyr_case_when()  1098.258  2096.2080  2426.7183  2325.7470  2625.439   9039.601  1000  b    
         map_list()  8764.319 16873.8670 18962.8540 18586.2790 20599.000  41524.564  1000    d  
    within_mapply()  6718.368 12397.1440 13806.1752 13671.8120 14942.583  24958.390  1000   c   
    within_ifelse()   279.796   586.6675   690.1919   653.3345   737.232   8131.292  1000 a     
 within_case_when()   470.155   955.8990  1170.4641  1070.5655  1219.284  46736.879  1000 a

Answer 2

使用*apply进行此操作的简单方法是将整个逻辑（带有条件函数和两个函数）放入另一个函数，并与apply一起使用MARGIN=1来传递逐行显示数据（lapply将按列传递数据）

apply(df, 1, function(row) {
    num <- as.numeric(row['Numbers'])
    if (row['Letters'] == 'A') first_function(num) else second_function(num)
    })

[1] 1 3 4

在@ r2evans中，此方法的问题在下面的注释中指出，是当您将apply与异构data.frame一起使用时（在这种情况下，Letters的类型为{{1 }} factor的类型为Numbers时，传递给应用函数的每一行均作为向量传递，该向量只能具有单个类型，因此该行中的所有内容都被强制转换为同一类型（在此情况下情况1}}）。这就是为什么必须使用integer来将character转换为类型as.numeric(row['Numbers'])的原因。根据您的数据，这可能是一个简单的修复（如上所述），也可能使事情变得更加复杂且容易发生错误。无论哪种方式，@ akrun的解决方案都更好，因为它保留了每个变量的原始数据类型。

Answer 3

lapply在这种情况下比较困难，因为它是基于列的。但是，您可以尝试通过t()来转置数据，并在持续的情况下使用lapply。在这里，我提供了两种使用mapply和ifelse的方式：

df$Letters <- as.character(df$Letters)

# Method 1
within(df, Numbers <- mapply(Letters, Numbers, FUN = function(x, y){
                             switch(x, "A" = first_function(y),
                                       "B" = second_function(y))
}))

# Method 2
within(df, Numbers <- ifelse(Letters == "A",
                             first_function(Numbers),
                             second_function(Numbers)))

以上两者均具有相同的输出：

#   Letters Numbers
# 1       A       1
# 2       B       3
# 3       B       4

Answer 4

这里有一个data.table变体，可以在有许多数据行的情况下获得更好的性能（但也会显示隐式转换问题）：

library(data.table)

setDT(df)  # fast convertion from data.frame to data.table

df[  Letters == "A",  Numbers := first_function(Numbers) ]
df[!(Letters == "A"), Numbers := second_function(Numbers)]  # issues a warning, see below

df
# Letters Numbers
# 1:       A       1
# 2:       B       3
# 3:       B       4

发出的警告是：

警告消息：在[.data.table（df，！（字母==“ A”）中， :=（Numbers，second_function（Numbers）））：将“双” RHS强制为 'integer'匹配列的类型；可能具有截断的精度。首先将目标列['Numbers']更改为'double'（通过创建一个新的'double'向量长度3（整个表的行），然后分配;即“替换”列），或将RHS强制为“整数”（例如 1L，NA_ [real | integer] _，as。*等），以使您的意图清晰明了速度。或者，在创建桌子并坚持下去。

原因是data.frame列Numbers是整数

> str(df)
'data.frame':   3 obs. of  2 variables:
 $ Letters: Factor w/ 2 levels "A","B": 1 2 2
 $ Numbers: int  1 2 3

但是函数返回双精度（出于某种原因）：

> typeof(first_function(df$Numbers))
[1] "double"

可变值的lappy

4 个答案: