我不是新的R用户,但从来没有写过循环,我想学习,因为在这种情况下,我认为它会节省时间,更有意义。
我有一个大型数据集,其中包含不同森林类型的访问频率数据,其简化子集如下所示:
f_type_1 <- sample(c("daily", "weekly", "monthly", "annually", "never"), 10, replace = T)
f_type_2 <- sample(c("daily", "weekly", "monthly", "annually", "never"), 10, replace = T)
f_type_3 <- sample(c("daily", "weekly", "monthly", "annually", "never"), 10, replace = T)
f_type_4 <- sample(c("daily", "weekly", "monthly", "annually", "never"), 10, replace = T)
f_type_5 <- sample(c("daily", "weekly", "monthly", "annually", "never"), 10, replace = T)
forvis<-data.frame(f_type_1, f_type_2, f_type_3, f_type_4, f_type_5)
我对每个单独的林类型不感兴趣,而是想创建一个新的变量forvis$f_vis_freqency
,它提供一般的森林访问频率。即我想写一个循环或一个遍历行的函数,并根据条件"daily"
语句填充具有最高频率的新变量(在这种情况下,它将是if
)。 if
该行包含"daily"
,然后使用"daily"
else if
行包含"weekly"
,然后使用"weekly"
,依此类推,每月,每年和从不(在真实的数据集我还有很多要处理的事情。)
我已经使用嵌套ifelse
调用的大型(真实数据中的73行代码)完成了此操作,但我觉得必须有更好的方法来执行此操作。
我使用的当前方法如下所示:
forvis$f_visit_freqency<-ifelse(forvis$f_type_1=="daily" | forvis$f_type_2=="daily" | forvis$f_type_3 =="daily" | forvis$f_type_4 == "daily" | forvis$f_type_5 == "daily", "daily",
ifelse(forvis$f_type_1=="weekly" | forvis$f_type_2=="weekly" | forvis$f_type_3 =="weekly" | forvis$f_type_4 == "weekly" | forvis$f_type_5 == "weekly", "weekly",
ifelse(forvis$f_type_1=="monthly" | forvis$f_type_2=="monthly" | forvis$f_type_3 =="monthly" | forvis$f_type_4 == "monthly" | forvis$f_type_5 == "monthly", "monthly",
ifelse(forvis$f_type_1=="annually" | forvis$f_type_2=="annually" | forvis$f_type_3 =="annually" | forvis$f_type_4 == "annually" | forvis$f_type_5 == "annually", "annually",
ifelse(forvis$f_type_1=="never" | forvis$f_type_2=="never" | forvis$f_type_3 =="never" | forvis$f_type_4 == "never" | forvis$f_type_5 == "never", "never",
NA))))
)
答案 0 :(得分:1)
我很自然地将这样的数据帧从宽格式转换为长格式,然后处理数据。 dplyr
和tidyr
在这方面做得很好。
# Load packages
library(dplyr)
library(tidyr)
# Process the data
forvis2 <- forvis %>%
# Create an ID for each person
mutate(Person = row_number()) %>%
# Reshape the data frame to long format
gather(Forest, Frequency, -Person) %>%
# Convert the Frequency column to factor
mutate(Frequency = factor(Frequency,
levels = c("daily", "weekly", "monthly",
"annually", "never"))) %>%
# Arrange the data by Person, then by Frequency
arrange(Person, Frequency) %>%
# Group by person
group_by(Person) %>%
# Only keep the first row for each person
slice(1)
# Add the frequency to forvis
forvis$f_vis_freqency <- forvis2$Frequency
# Transpose the data frame
forvis2 <- as.data.frame(t(forvis))
# Covert each column to the right factor level, save as a list
forvis_list <- lapply(forvis2,
factor,
levels = c("daily", "weekly", "monthly", "annually", "never"))
# Sort each column and select the first one
# Store the result to f_vis_freqency as a new column to forvis
forvis$f_vis_freqency <- sapply(lapply(forvis_list, sort), `[`, 1)
此解决方案的关键是我们可以使用unlist
将数据帧的行转换为向量。如果定义了vector,那么我们可以sort
向量。
# Convert all columns in forvis to the right factor level
forvis2 <- as.data.frame(lapply(forvis, factor,
levels = c("daily", "weekly", "monthly",
"annually", "never")))
# Use sapply to apply a function
forvis$f_vis_freqency <- sapply(1:nrow(forvis2),
function(i) {sort(unlist(forvis2[i, ]))[1]})
我们可以定义一个函数来返回一行的答案,然后使用sapply
将此函数应用于每一行。在这里,我演示了两个这样的功能。
* A:使用多个ifelse语句的函数*
请注意,此功能不要求用户将列转换为factor
。它可以在character
上使用。
# A function to report the frequency label
# i is the row index, while dt is the input data frame
report_freq <- function(i, dt = forvis){
temp <- unique(unlist(dt[i, ]))
# Use multiple ifelse statement to test if there are any matching labels
answer <- ifelse(any(temp %in% "daily"), "daily",
ifelse(any(temp %in% "weekly"), "weekly",
ifelse(any(temp %in% "monthly"), "monthly",
ifelse(any(temp %in% "annually"), "annually", "never"))))
return(answer)
}
# Use sapply to apply the function
forvis$f_vis_freqency <- sapply(1:nrow(forvis), report_freq, dt = forvis)
* B:使用因子级别和排序*
的函数此方法无需将数据框的每个元素转换为factor
。它从每一行读入数据并返回答案。
# A function to report the frequency label
# i is the row index, while dt is the input data frame
report_freq <- function(i, dt = forvis){
temp <- unique(unlist(dt[i, ]))
temp <- factor(temp, levels = c(levels = c("daily", "weekly", "monthly",
"annually", "never")))
answer <- sort(temp)[1]
return(answer)
}
# Use sapply to apply the function
forvis$f_vis_freqency <- sapply(1:nrow(forvis), report_freq, dt = forvis)
# Create the factor level as a vector
factor_level <- c("daily", "weekly", "monthly", "annually", "never")
# Find the index
ind <- apply(as.data.frame(lapply(forvis, function(x) {
x <- as.numeric(factor(x, levels = factor_level))})), 1, min)
# Return the index with associated character
forvis$f_vis_freqency <- factor_level[ind]
我使用microbenchmark
库来衡量效果。结果如下。
Unit: microseconds
expr min lq mean median uq max neval
S0 582.465 637.633 732.5772 654.953 675.4800 79961.281 5000
S1 14394.199 15048.831 16353.5785 15363.477 16724.0580 100905.644 5000
S2 1594.723 1667.852 1841.7731 1710.831 1800.6380 7175.609 5000
S3 3208.689 3319.986 3681.2132 3394.399 3619.2380 59523.688 5000
S4a 2041.194 2171.415 2438.0034 2232.034 2367.0660 59986.837 5000
S4b 3327.363 3457.585 3832.7902 3547.712 3774.1555 58306.156 5000
S5 554.882 624.803 686.8600 645.972 672.9145 5789.369 5000
令我惊讶的是,解决方案0(OP的原始解决方案)非常快,而解决方案5是最快的。
这是代码。
library(microbenchmark)
# Solution 0
F0 <- function(){
ifelse(forvis$f_type_1=="daily" | forvis$f_type_2=="daily" | forvis$f_type_3 =="daily" | forvis$f_type_4 == "daily" | forvis$f_type_5 == "daily", "daily",
ifelse(forvis$f_type_1=="weekly" | forvis$f_type_2=="weekly" | forvis$f_type_3 =="weekly" | forvis$f_type_4 == "weekly" | forvis$f_type_5 == "weekly", "weekly",
ifelse(forvis$f_type_1=="monthly" | forvis$f_type_2=="monthly" | forvis$f_type_3 =="monthly" | forvis$f_type_4 == "monthly" | forvis$f_type_5 == "monthly", "monthly",
ifelse(forvis$f_type_1=="annually" | forvis$f_type_2=="annually" | forvis$f_type_3 =="annually" | forvis$f_type_4 == "annually" | forvis$f_type_5 == "annually", "annually",
ifelse(forvis$f_type_1=="never" | forvis$f_type_2=="never" | forvis$f_type_3 =="never" | forvis$f_type_4 == "never" | forvis$f_type_5 == "never", "never",
NA))))
)
}
# Solution 1
F1 <- function(){
forvis2 <- forvis %>%
mutate(Person = row_number()) %>%
gather(Forest, Frequency, -Person) %>%
mutate(Frequency = factor(Frequency,
levels = c("daily", "weekly", "monthly",
"annually", "never"))) %>%
arrange(Person, Frequency) %>%
group_by(Person) %>%
slice(1)
forvis2$Frequency
}
# Solution 2
F2 <- function(){
forvis2 <- as.data.frame(t(forvis))
forvis_list <- lapply(forvis2,
factor,
levels = c("daily", "weekly", "monthly", "annually", "never"))
sapply(lapply(forvis_list, sort), `[`, 1)
}
# Solution 3
F3 <- function(){
forvis2 <- as.data.frame(lapply(forvis, factor,
levels = c("daily", "weekly", "monthly",
"annually", "never")))
sapply(1:nrow(forvis2), function(i) {sort(unlist(forvis2[i, ]))[1]})
}
# Solution 4a
F4a <- function(){
report_freq <- function(i, dt = forvis){
temp <- unique(unlist(dt[i, ]))
answer <- ifelse(any(temp %in% "daily"), "daily",
ifelse(any(temp %in% "weekly"), "weekly",
ifelse(any(temp %in% "monthly"), "monthly",
ifelse(any(temp %in% "annually"), "annually", "never"))))
return(answer)
}
sapply(1:nrow(forvis), report_freq, dt = forvis)
}
# Solution 4b
F4b <- function(){
report_freq <- function(i, dt = forvis){
temp <- unique(unlist(dt[i, ]))
temp <- factor(temp, levels = c(levels = c("daily", "weekly", "monthly",
"annually", "never")))
answer <- sort(temp)[1]
return(answer)
}
sapply(1:nrow(forvis), report_freq, dt = forvis)
}
# Solution 5
F5 <- function(){
factor_level <- c("daily", "weekly", "monthly", "annually", "never")
ind <- apply(as.data.frame(lapply(forvis, function(x) {
x <- as.numeric(factor(x, levels = factor_level))})), 1, min)
factor_level[ind]
}
# Measure the performance
microbenchmark(
S0 = F0(),
S1 = F1(),
S2 = F2(),
S3 = F3(),
S4a = F4a(),
S4b = F4b(),
S5 = F5(),
times = 5000
)