寻求一种更简洁的方法来避免嵌套的if语句(带有sapply)

时间:2016-07-18 10:03:15

标签: r if-statement sapply

我有2个数据框

# Per-country thresholds: columns A, B and C hold the (exclusive) lower bound
# of class A, B and C respectively.
lookup_table <- data.frame(
  Country = c("UK", "France", "Germany"),
  A = c(0, 0, 1),
  B = c(1, 6, 7),
  C = c(4, 8, 9)
)

# Fix the RNG seed so the random Values below are reproducible.
set.seed(123)
df <- data.frame(
  Country = c(
    "UK", "UK", "France", "France", "Germany",
    "Germany", "Germany", "France", "UK"
  ),
  Values = runif(9, 1, 10)
)

我想在df中有第3列,它根据第2列中的值和国家/地区的值来分配类。

类似下面这样,但阈值不应该写死:它们应该取决于查找表中对应国家/地区的取值

# Assign a class label from fixed cut points:
#   (0, 1] -> "A", (1, 4] -> "B", (4, Inf] -> "C".
#
# x: numeric vector; a single value works too, so existing sapply()/vapply()
#    callers are unaffected.
# Returns a character vector the same length as x. Values outside every
# interval (x <= 0) and NA inputs yield NA_character_ instead of failing with
# "object 'y' not found".
Class <- function(x) {
  # cut() uses right-closed intervals by default, which matches the
  # `x > lower & x <= upper` tests of the original implementation.
  as.character(cut(x, breaks = c(0, 1, 4, Inf), labels = c("A", "B", "C")))
}

# vapply() guarantees exactly one character label per value; sapply() would
# silently change its return type on empty or irregular input.
df$Class <- vapply(df$Values, Class, character(1))

提前感谢您提供任何帮助

5 个答案:

答案 0 :(得分:1)

我们可以将'lookup_table'与'df'按'Country'进行连接(join),再用melt把结果转换为"长"格式。如@zx8754所述,可以按'Country'分组使用cut函数(或者用findInterval得到数字索引,再用该索引取出对应的'variable'),并将结果赋值给'newVar'。

library(data.table)
# Join the per-country thresholds onto every row of df. setDT() converts
# lookup_table to a data.table by reference.
joined <- setDT(lookup_table)[df, on = "Country"]
# Reshape to long form: one row per (observation, threshold column). The
# argument is spelled out as `id.vars` rather than relying on partial matching.
d1 <- melt(joined, id.vars = c("Country", "Values"))
# Within each country, findInterval() locates Values among that country's
# thresholds and the resulting index picks the matching threshold column name.
d1[, newVar := unique(variable)[findInterval(Values, unique(value))], by = Country]

选取感兴趣的列,并用unique去除重复行

# Keep only the columns of interest, then drop the duplicate rows created by melt().
unique(d1[, .(Country, Values, newVar)])
#   Country   Values newVar
#1:      UK 3.588198      B
#2:      UK 8.094746      C
#3:  France 4.680792      A
#4:  France 8.947157      C
#5: Germany 9.464206      C
#6: Germany 1.410008      A
#7: Germany 5.752949      A
#8:  France 9.031771      C
#9:      UK 5.962915      C

答案 1 :(得分:1)

这是dplyr解决方案。

library(dplyr)
# Attach each row's country-specific thresholds, then classify Values using
# right-closed intervals: (A, B] -> "A", (B, C] -> "B", above C -> "C".
# The upper bounds are inclusive (<=) so a value exactly equal to B or C is
# classified instead of falling through to "Not_found".
df %>%
  inner_join(lookup_table, by = "Country") %>%
  mutate(Class = case_when(
    is.na(Values) ~ NA_character_, # keep missing values missing
    Values > A & Values <= B ~ "A",
    Values > B & Values <= C ~ "B",
    Values > C ~ "C",
    TRUE ~ "Not_found" # at or below the country's lowest threshold
  ))

在管道末尾添加select(-c(A,B,C))以获得更清晰的输出data.frame。作为此方法的额外好处,任何不属于范围的值都将标记为"Not_found"

答案 2 :(得分:0)

另一种选择:

# Left-join the thresholds onto df (all.x = TRUE keeps every row of df).
df <- merge(df, lookup_table, by = "Country", all.x = TRUE)

# Start every row at "A", then overwrite the rows that fall into a higher class.
# NOTE(review): rows at or below threshold A, and rows whose Country has no
# lookup entry, also keep the default "A".
df$Class <- "A"
df$Class <- with(df, replace(Class, Values > B & Values <= C, "B"))
df$Class <- with(df, replace(Class, Values > C, "C"))
df
#  Country   Values A B C Class
#1  France 2.371120 0 6 8     A
#2  France 6.155804 0 6 8     B
#3  France 5.635268 0 6 8     A
#4 Germany 9.661230 1 7 9     C
#5 Germany 6.412292 1 7 9     A
#6 Germany 3.148534 1 7 9     A
#7      UK 4.661493 0 1 4     C
#8      UK 6.933073 0 1 4     C
#9      UK 4.623160 0 1 4     C

您可以从结果中删除任何不需要的列。

答案 3 :(得分:0)

这是基础R的结果:

# Join on the shared column(s); the result is laid out as
# Country, A, B, C, Values (lookup_table's columns come before df's).
dfa <- merge(x = lookup_table, y = df)
# Classify one merged row against its own thresholds.
#
# x: a single row (or plain vector) laid out positionally as
#    1 = Country, 2 = A, 3 = B, 4 = C, 5 = Values.
# Returns "A" for values in (A, B], "B" for (B, C], "C" for values above C,
# and NA_character_ when the value is missing or not above A (the previous
# version failed with "object 'y' not found" in that case).
Class <- function(x) {
  # `[[` extracts plain scalars; `[` on a data-frame row would give 1x1 frames.
  value <- x[[5]]
  if (is.na(value)) {
    return(NA_character_)
  }
  if (value > x[[4]]) {
    "C"
  } else if (value > x[[3]]) {
    "B"
  } else if (value > x[[2]]) {
    "A"
  } else {
    NA_character_
  }
}
# Classify row by row. vapply() pins the result to one string per row, and
# seq_len() yields an empty sequence (not c(1, 0)) when dfa has no rows.
dfa$Class <- vapply(
  seq_len(nrow(dfa)),
  function(ri) Class(dfa[ri, ]),
  character(1)
)
# Drop the threshold columns (positions 2 to 4) for display.
dfa[, -(2:4)]

> dfa[,-c(2:4)]
  Country   Values Class
1  France 4.680792     A
2  France 8.947157     C
3  France 9.031771     C
4 Germany 1.410008     A
5 Germany 5.752949     A
6 Germany 9.464206     C
7      UK 3.588198     B
8      UK 8.094746     C
9      UK 5.962915     C

答案 4 :(得分:0)

如果把lookup_table改写成明确给出各个区间的形式,就可以使用data.table开发版本v1.9.7中的非等值联接(non-equi join)轻松完成此任务(安装方法请参见data.table的官方安装说明):

library(data.table) # v1.9.7+ for non-equi joins; library() errors if the package is missing
# Non-equi join: for each interval row in `lookup`, update the df rows with the
# same Country whose Values lie in (value1, value2].
setDT(df)[lookup, Class := i.Class, on = .(Country, Values > value1, Values <= value2)]
#    Country   Values Class
# 1:      UK 3.588198     B
# 2:      UK 8.094746     C
# 3:  France 4.680792     A
# 4:  France 8.947157     C
# 5: Germany 9.464206     C
# 6: Germany 1.410008     A
# 7: Germany 5.752949     A
# 8:  France 9.031771     C
# 9:      UK 5.962915     C

## i.Class refers to Class from i argument = lookup$Class

其中lookup由lookup_table构造而成,如下所示:

# Add an open upper bound for the last class; both steps modify lookup_table
# by reference.
setDT(lookup_table)
lookup_table[, D := Inf]

# Stack the three classes into long form: one (value1, value2] interval per
# country and class. Country is recycled across the three classes.
lookup <- lookup_table[, list(
  Country,
  Class = rep(c("A", "B", "C"), each = .N),
  value1 = c(A, B, C),
  value2 = c(B, C, D)
)]
#    Country Class value1 value2
# 1:      UK     A      0      1
# 2:  France     A      0      6
# 3: Germany     A      1      7
# 4:      UK     B      1      4
# 5:  France     B      6      8
# 6: Germany     B      7      9
# 7:      UK     C      4    Inf
# 8:  France     C      8    Inf
# 9: Germany     C      9    Inf