Question

我有一个名为test_data的数据框：

date         test     score  
2015-10-26   test1      4.00  
2015-10-26   test2      4.99  
2015-10-26   test3      15.10  
2015-10-26   this continues to test23  
2016-05-01   test1      4.20  
2016-05-01   test2      5.50  
2016-05-01   test3      16.2  
2016-05-01   this continues to test23

我有一个数据框，其中包含每个测试的高阈值和低阈值：

test     low     high  
test1    3.0      6.0  
test2    6.0      8.0  
test3    12.0     18.0

……以此类推，一直到test23

我正在尝试评估test_data并确定分数是否超出阈值范围。例如，2015-10-26的test1不在阈值范围之外。但是，2015-10-26的test2超出了阈值范围。

这是蛮力方法：

# Brute force: handle one test at a time. For each test, keep only its rows and
# compare the scores against that test's limits in `thresh` (the low/high
# threshold table: column 2 = low, column 3 = high).
# A score is flagged 'yes' when it is above the high limit or below the low one.
test1_grp <- filter(test_data, test == 'test1')
test1_grp <- mutate(test1_grp, out_thresh = ifelse((test1_grp$score > thresh[thresh$test == "test1", 3]) | (test1_grp$score < thresh[thresh$test == "test1", 2]), 'yes', 'no'))
test2_grp <- filter(test_data, test == 'test2')
test2_grp <- mutate(test2_grp, out_thresh = ifelse((test2_grp$score > thresh[thresh$test == "test2", 3]) | (test2_grp$score < thresh[thresh$test == "test2", 2]), 'yes', 'no'))

依此类推，一直到test23。一定有更高效的方法。

我尝试使用group_by，但无法弄清楚如何为每个组分配高阈值和低阈值：

# Attempted grouped version. The limits are still looked up with the literal
# "test1" (column 3 = high, column 2 = low), so every group is compared against
# test1's thresholds instead of its own -- the per-test lookup is the missing
# piece.
test_data %>% dplyr::group_by(test) %>% 
dplyr::mutate(out_thresh = ifelse((score > thresh[thresh$test == "test1",3]) | (score < thresh[thresh$test == "test1", 2]), 
'yes', 'no'))

数据（感谢@akrun）

# Sample scores: three tests measured on two dates (one row per date/test).
test_data <- data.frame(
  date = rep(c("2015-10-26", "2016-05-01"), each = 3),
  test = rep(c("test1", "test2", "test3"), times = 2),
  score = c(4, 4.99, 15.1, 4.2, 5.5, 16.2),
  stringsAsFactors = FALSE
)

# Allowed score range for each test: one row per test with its low/high limits.
threshold <- data.frame(
  test = c("test1", "test2", "test3"),
  low = c(3, 6, 12),
  high = c(6, 8, 18),
  stringsAsFactors = FALSE
)

Answer 1

我认为你想要做的是用merge把两个数据框合并在一起，然后使用ifelse语句：

# Attach each test's low/high limits to its score rows by joining on `test`.
merged_df <- merge(test_data, threshold, by = 'test')
# 'Yes' when the score lies strictly between the low and high limits, else 'No'.
in_range <- merged_df$score > merged_df$low & merged_df$score < merged_df$high
merged_df$ThresholdFlag <- ifelse(in_range, 'Yes', 'No')
rm(in_range)

   test       date score low high ThresholdFlag
1 test1 2015-10-26  4.00   3    6           Yes
2 test1 2016-05-01  4.20   3    6           Yes
3 test2 2015-10-26  4.99   6    8            No
4 test2 2016-05-01  5.50   6    8            No
5 test3 2015-10-26 15.10  12   18           Yes
6 test3 2016-05-01 16.20  12   18           Yes

还可以使用data.table包来实现此目的：

library(data.table)
# setDT() converts each data frame to a data.table in place (by reference).
setDT(test_data)
# Key both tables on `test` so the X[Y] join below matches rows on that column.
setkey(test_data, test)
setDT(threshold)
setkey(threshold, test)

# test_data[threshold] joins the two tables on their shared key; `:=` then adds
# ThresholdFlag, which is TRUE when score lies strictly between low and high.
test_dt <- test_data[threshold][,ThresholdFlag := (score > low & score < high)]
         date  test score low high ThresholdFlag
1: 2015-10-26 test1  4.00   3    6          TRUE
2: 2016-05-01 test1  4.20   3    6          TRUE
3: 2015-10-26 test2  4.99   6    8         FALSE
4: 2016-05-01 test2  5.50   6    8         FALSE
5: 2015-10-26 test3 15.10  12   18          TRUE
6: 2016-05-01 test3 16.20  12   18          TRUE

Answer 2

我们可以使用left_join

library(dplyr)
# Bring each test's low/high limits onto its score rows, then flag the rows
# whose score lies strictly inside (low, high).
test_data %>%
  left_join(threshold, by = "test") %>%
  mutate(grp = low < score & score < high)
#        date  test score low high   grp
#1 2015-10-26 test1  4.00   3    6  TRUE
#2 2015-10-26 test2  4.99   6    8 FALSE
#3 2015-10-26 test3 15.10  12   18  TRUE
#4 2016-05-01 test1  4.20   3    6  TRUE
#5 2016-05-01 test2  5.50   6    8 FALSE
#6 2016-05-01 test3 16.20  12   18  TRUE

也可以使用base R中的match来完成此操作

# For every row of test_data, the index of the matching row in threshold.
i1 <- match(test_data$test, threshold$test)
# TRUE where the score lies strictly between that test's low and high limits.
threshold$low[i1] < test_data$score & test_data$score < threshold$high[i1]
#[1]  TRUE FALSE  TRUE  TRUE FALSE  TRUE

注意：我在这里使用TRUE/FALSE代替yes/no，因为它更容易操作

如何在R中使用两个不同的变量向数据框添加新列？

2 个答案: