for循环/函数通过条件检查多个字符串来创建新变量

时间:2017-06-16 04:28:24

标签: r function for-loop if-statement

我不是新的R用户,但从来没有写过循环,我想学习,因为在这种情况下,我认为它会节省时间,更有意义。

我有一个大型数据集,其中包含不同森林类型的访问频率数据,其简化子集如下所示:

f_type_1 <- sample(c("daily", "weekly", "monthly", "annually", "never"), 10, replace = T)
f_type_2 <- sample(c("daily", "weekly", "monthly", "annually", "never"), 10, replace = T)
f_type_3 <- sample(c("daily", "weekly", "monthly", "annually", "never"), 10, replace = T)
f_type_4 <- sample(c("daily", "weekly", "monthly", "annually", "never"), 10, replace = T)
f_type_5 <- sample(c("daily", "weekly", "monthly", "annually", "never"), 10, replace = T)
forvis<-data.frame(f_type_1, f_type_2, f_type_3, f_type_4, f_type_5)

我对每个单独的林类型不感兴趣,而是想创建一个新的变量forvis$f_vis_freqency,它提供一般的森林访问频率。即我想写一个循环或一个遍历行的函数,并根据条件"daily"语句填充具有最高频率的新变量(在这种情况下,它将是if)。 if该行包含"daily",然后使用"daily" else if行包含"weekly",然后使用"weekly",依此类推,每月,每年和从不(在真实的数据集我还有很多要处理的事情。)

我已经使用嵌套ifelse调用的大型(真实数据中的73行代码)完成了此操作,但我觉得必须有更好的方法来执行此操作。 我使用的当前方法如下所示:

    forvis$f_visit_freqency<-ifelse(forvis$f_type_1=="daily" | forvis$f_type_2=="daily" | forvis$f_type_3 =="daily" | forvis$f_type_4 == "daily" | forvis$f_type_5 == "daily", "daily",
                                 ifelse(forvis$f_type_1=="weekly" | forvis$f_type_2=="weekly" | forvis$f_type_3 =="weekly" | forvis$f_type_4 == "weekly" | forvis$f_type_5 == "weekly", "weekly",
                                        ifelse(forvis$f_type_1=="monthly" | forvis$f_type_2=="monthly" | forvis$f_type_3 =="monthly" | forvis$f_type_4 == "monthly" | forvis$f_type_5 == "monthly", "monthly",
                                               ifelse(forvis$f_type_1=="annually" | forvis$f_type_2=="annually" | forvis$f_type_3 =="annually" | forvis$f_type_4 == "annually" | forvis$f_type_5 == "annually", "annually",
                                                      ifelse(forvis$f_type_1=="never" | forvis$f_type_2=="never" | forvis$f_type_3 =="never" | forvis$f_type_4 == "never" | forvis$f_type_5 == "never", "never",
                                                             NA))))

                                 )

1 个答案:

答案 0 :(得分:1)

解决方案1:使用dplyr和tidyr

中的函数

我很自然地将这样的数据帧从宽格式转换为长格式,然后处理数据。 dplyrtidyr在这方面做得很好。

# Load packages
library(dplyr)
library(tidyr)

# Process the data    
forvis2 <- forvis %>%
  # Create an ID for each person
  mutate(Person = row_number()) %>%
  # Reshape the data frame to long format
  gather(Forest, Frequency, -Person) %>%
  # Convert the Frequency column to factor
  mutate(Frequency = factor(Frequency,
                            levels = c("daily", "weekly", "monthly",
                                       "annually", "never"))) %>%
  # Arrange the data by Person, then by Frequency
  arrange(Person, Frequency) %>%
  # Group by person
  group_by(Person) %>%
  # Only keep the first row for each person
  slice(1)

# Add the frequency to forvis
forvis$f_vis_freqency <- forvis2$Frequency

解决方案2:使用基础R

中的函数
# Transpose the data frame
forvis2 <- as.data.frame(t(forvis))

# Covert each column to the right factor level, save as a list
forvis_list <- lapply(forvis2, 
                      factor, 
                      levels = c("daily", "weekly", "monthly", "annually", "never"))

# Sort each column and select the first one
# Store the result to f_vis_freqency as a new column to forvis
forvis$f_vis_freqency <- sapply(lapply(forvis_list, sort), `[`, 1)

解决方案3:使用供应对每一行进行排序

此解决方案的关键是我们可以使用unlist将数据帧的行转换为向量。如果定义了vector,那么我们可以sort向量。

# Convert all columns in forvis to the right factor level
forvis2 <- as.data.frame(lapply(forvis, factor, 
                                levels = c("daily", "weekly", "monthly",
                                "annually", "never")))

# Use sapply to apply a function
forvis$f_vis_freqency <- sapply(1:nrow(forvis2),
                            function(i) {sort(unlist(forvis2[i, ]))[1]})

解决方案4:定义一个函数,然后在每一行中使用sapply

我们可以定义一个函数来返回一行的答案,然后使用sapply将此函数应用于每一行。在这里,我演示了两个这样的功能。

* A:使用多个ifelse语句的函数*

请注意,此功能不要求用户将列转换为factor。它可以在character上使用。

# A function to report the frequency label
# i is the row index, while dt is the input data frame
report_freq <- function(i, dt = forvis){

  temp <- unique(unlist(dt[i, ]))

  # Use multiple ifelse statement to test if there are any matching labels
  answer <- ifelse(any(temp %in% "daily"), "daily",
                   ifelse(any(temp %in% "weekly"), "weekly",
                          ifelse(any(temp %in% "monthly"), "monthly",
                                 ifelse(any(temp %in% "annually"), "annually", "never"))))
  return(answer)
}

# Use sapply to apply the function
forvis$f_vis_freqency <- sapply(1:nrow(forvis), report_freq, dt = forvis)

* B:使用因子级别和排序*

的函数

此方法无需将数据框的每个元素转换为factor。它从每一行读入数据并返回答案。

# A function to report the frequency label
# i is the row index, while dt is the input data frame
report_freq <- function(i, dt = forvis){

  temp <- unique(unlist(dt[i, ]))

  temp <- factor(temp, levels = c(levels = c("daily", "weekly", "monthly",
                                             "annually", "never")))
  answer <- sort(temp)[1]

  return(answer)
}

# Use sapply to apply the function
forvis$f_vis_freqency <- sapply(1:nrow(forvis), report_freq, dt = forvis)

解决方案5:将因子转换为数字,然后找到最小值

# Create the factor level as a vector
factor_level <- c("daily", "weekly", "monthly", "annually", "never")

# Find the index  
ind <- apply(as.data.frame(lapply(forvis, function(x) {
x <- as.numeric(factor(x, levels = factor_level))})), 1, min)

# Return the index with associated character  
forvis$f_vis_freqency <- factor_level[ind]

绩效衡量指标

我使用microbenchmark库来衡量效果。结果如下。

Unit: microseconds
 expr       min        lq       mean    median         uq        max neval
   S0   582.465   637.633   732.5772   654.953   675.4800  79961.281  5000
   S1 14394.199 15048.831 16353.5785 15363.477 16724.0580 100905.644  5000
   S2  1594.723  1667.852  1841.7731  1710.831  1800.6380   7175.609  5000
   S3  3208.689  3319.986  3681.2132  3394.399  3619.2380  59523.688  5000
  S4a  2041.194  2171.415  2438.0034  2232.034  2367.0660  59986.837  5000
  S4b  3327.363  3457.585  3832.7902  3547.712  3774.1555  58306.156  5000
   S5   554.882   624.803   686.8600   645.972   672.9145   5789.369  5000

令我惊讶的是,解决方案0(OP的原始解决方案)非常快,而解决方案5是最快的。

这是代码。

library(microbenchmark)

# Solution 0
F0 <- function(){
  ifelse(forvis$f_type_1=="daily" | forvis$f_type_2=="daily" | forvis$f_type_3 =="daily" | forvis$f_type_4 == "daily" | forvis$f_type_5 == "daily", "daily",
         ifelse(forvis$f_type_1=="weekly" | forvis$f_type_2=="weekly" | forvis$f_type_3 =="weekly" | forvis$f_type_4 == "weekly" | forvis$f_type_5 == "weekly", "weekly",
                ifelse(forvis$f_type_1=="monthly" | forvis$f_type_2=="monthly" | forvis$f_type_3 =="monthly" | forvis$f_type_4 == "monthly" | forvis$f_type_5 == "monthly", "monthly",
                       ifelse(forvis$f_type_1=="annually" | forvis$f_type_2=="annually" | forvis$f_type_3 =="annually" | forvis$f_type_4 == "annually" | forvis$f_type_5 == "annually", "annually",
                              ifelse(forvis$f_type_1=="never" | forvis$f_type_2=="never" | forvis$f_type_3 =="never" | forvis$f_type_4 == "never" | forvis$f_type_5 == "never", "never",
                                     NA))))

  )
}

# Solution 1
F1 <- function(){
  forvis2 <- forvis %>%
    mutate(Person = row_number()) %>%
    gather(Forest, Frequency, -Person) %>%
    mutate(Frequency = factor(Frequency,
                              levels = c("daily", "weekly", "monthly",
                                         "annually", "never"))) %>%
    arrange(Person, Frequency) %>%
    group_by(Person) %>%
    slice(1)

  forvis2$Frequency
}

# Solution 2
F2 <- function(){
  forvis2 <- as.data.frame(t(forvis))
  forvis_list <- lapply(forvis2, 
                        factor, 
                        levels = c("daily", "weekly", "monthly", "annually", "never"))
  sapply(lapply(forvis_list, sort), `[`, 1)
}

# Solution 3
F3 <- function(){
  forvis2 <- as.data.frame(lapply(forvis, factor, 
                                  levels = c("daily", "weekly", "monthly",
                                             "annually", "never")))
  sapply(1:nrow(forvis2), function(i) {sort(unlist(forvis2[i, ]))[1]})
}

# Solution 4a
F4a <- function(){
  report_freq <- function(i, dt = forvis){

    temp <- unique(unlist(dt[i, ]))
    answer <- ifelse(any(temp %in% "daily"), "daily",
                     ifelse(any(temp %in% "weekly"), "weekly",
                            ifelse(any(temp %in% "monthly"), "monthly",
                                   ifelse(any(temp %in% "annually"), "annually", "never"))))
    return(answer)
  }
  sapply(1:nrow(forvis), report_freq, dt = forvis)
}

# Solution 4b
F4b <- function(){
  report_freq <- function(i, dt = forvis){

    temp <- unique(unlist(dt[i, ]))
    temp <- factor(temp, levels = c(levels = c("daily", "weekly", "monthly",
                                               "annually", "never")))
    answer <- sort(temp)[1]
    return(answer)
  }
  sapply(1:nrow(forvis), report_freq, dt = forvis)
}

# Solution 5
F5 <- function(){
  factor_level <- c("daily", "weekly", "monthly", "annually", "never")
  ind <- apply(as.data.frame(lapply(forvis, function(x) {
    x <- as.numeric(factor(x, levels = factor_level))})), 1, min)
  factor_level[ind]
}

# Measure the performance
microbenchmark(
  S0 = F0(),
  S1 = F1(),
  S2 = F2(),
  S3 = F3(),
  S4a = F4a(),
  S4b = F4b(),
  S5 = F5(),
  times = 5000
)