这类似于以下的问题。但是,我需要执行更多步骤:
•按列ID和order分组
•对于df_dat中的每个val,请使用以下条件在df_lookup表中查找对应的ratio:
o If val < min(df_lookup$val), set new_ratio = min(df_lookup$ratio)
o If val > max(df_lookup$val), set new_ratio = max(df_lookup$ratio)
o If val falls within df_lookup$val range, do a simple linear interpolation
我的数据:
library(dplyr)
# Lookup table of (val, ratio) points per ID/order combination.
# Only two combinations exist: batch1 with order 1 and batch2 with order 2.
df_lookup <- tribble(
~ID, ~order, ~pct, ~val, ~ratio,
"batch1", 1, 1, 1, 0.2,
"batch1", 1, 10, 8, 0.5,
"batch1", 1, 25, 25, 1.2,
"batch2", 2, 1, 2, 0.1,
"batch2", 2, 10, 15, 0.75,
"batch2", 2, 25, 33, 1.5,
"batch2", 2, 50, 55, 3.2,
)
df_lookup
#> # A tibble: 7 x 5
#> ID order pct val ratio
#> <chr> <dbl> <dbl> <dbl> <dbl>
#> 1 batch1 1 1 1 0.2
#> 2 batch1 1 10 8 0.5
#> 3 batch1 1 25 25 1.2
#> 4 batch2 2 1 2 0.1
#> 5 batch2 2 10 15 0.75
#> 6 batch2 2 25 33 1.5
#> 7 batch2 2 50 55 3.2
# Observations whose val has to be mapped to a ratio.
# The fifth row (order 2, batch1) has no matching ID/order pair in df_lookup.
df_dat <- tribble(
~order, ~ID, ~val,
1, "batch1", 0.1,
1, "batch1", 30,
1, "batch1", 2,
1, "batch1", 12,
2, "batch1", 45,
2, "batch2", 1.5,
2, "batch2", 30,
2, "batch2", 13,
2, "batch2", 60,
)
df_dat
#> # A tibble: 9 x 3
#> order ID val
#> <dbl> <chr> <dbl>
#> 1 1 batch1 0.1
#> 2 1 batch1 30
#> 3 1 batch1 2
#> 4 1 batch1 12
#> 5 2 batch1 45
#> 6 2 batch2 1.5
#> 7 2 batch2 30
#> 8 2 batch2 13
#> 9 2 batch2 60
先前的解决方案没有考虑分组,因此产生了错误的结果。
示例:
对于order = 2和ID = batch1,new_ratio应该为NA,因为查找表中不存在这一组合。
对于order = 1、ID = batch1和val = 30,new_ratio不应高于1.2(该组ratio的最大值)。
对于order = 1、ID = batch1和val = 2,new_ratio = 0.243,它是在0.2和0.5之间线性插值得到的ratio值。
任何帮助表示赞赏!
#error
# Inside with(df_lookup, ...) every `val` resolves to df_lookup$val, so
# approx() is evaluated at the lookup table's own 7 values rather than at the
# rows of the current group -- hence the length mismatch reported below.
df_dat %>%
group_by(ID, order) %>%
mutate(new_ratio = with(df_lookup, approx(val, ratio, val))$y)
#> Error: Column `new_ratio` must be length 4 (the group size) or one, not 7
#wrong output
# Copying val to val1 avoids the name clash with df_lookup$val, but approx()
# still interpolates over the whole df_lookup (both batches mixed together),
# ignoring the ID/order groups. Values outside the overall val range come back
# as NA instead of being clamped, as the output below shows.
df_dat %>%
group_by(ID, order) %>%
mutate(val1 = val) %>%
mutate(new_ratio = with(df_lookup, approx(val, ratio, val1))$y)
#> # A tibble: 9 x 5
#> # Groups: ID, order [3]
#> order ID val val1 new_ratio
#> <dbl> <chr> <dbl> <dbl> <dbl>
#> 1 1 batch1 0.1 0.1 NA
#> 2 1 batch1 30 30 1.39
#> 3 1 batch1 2 2 0.1
#> 4 1 batch1 12 12 0.643
#> 5 2 batch1 45 45 2.43
#> 6 2 batch2 1.5 1.5 0.15
#> 7 2 batch2 30 30 1.39
#> 8 2 batch2 13 13 0.679
#> 9 2 batch2 60 60 NA
预期输出
# A tibble: 9 x 4
order ID val new_ratio
<dbl> <chr> <dbl> <dbl>
1 1 batch1 0.1 0.2
2 1 batch1 30 1.2
3 1 batch1 2 0.243
4 1 batch1 12 0.643
5 2 batch1 45 NA
6 2 batch2 1.5 0.1
7 2 batch2 30 1.38
8 2 batch2 13 0.65
9 2 batch2 60 3.2
答案 0 :(得分:2)
library(dplyr)

# Interpolate `lkp_ratio` over `lkp_val` at the points `x`.
#
# x         numeric vector of values to look up.
# lkp_val   numeric vector of lookup positions for one ID/order group.
# lkp_ratio numeric vector of ratios, parallel to `lkp_val`.
#
# Returns a numeric vector the same length as `x`:
#   * NA when the group has no usable lookup point (no match in the join),
#   * min(lkp_ratio) below the lookup range, max(lkp_ratio) above it,
#   * the linear interpolation in between.
interpolate_ratio <- function(x, lkp_val, lkp_ratio) {
  usable <- !is.na(lkp_val) & !is.na(lkp_ratio)
  if (!any(usable)) {
    return(rep(NA_real_, length(x)))
  }
  lkp_val <- lkp_val[usable]
  lkp_ratio <- lkp_ratio[usable]
  if (length(lkp_val) < 2L) {
    # approx() refuses to interpolate through a single point; with one point
    # the min, the max and the exact match are all that same ratio.
    return(ifelse(is.na(x), NA_real_, lkp_ratio))
  }
  approx(
    x = lkp_val, y = lkp_ratio, xout = x,
    yleft = min(lkp_ratio), yright = max(lkp_ratio)
  )$y
}

# A row id keeps every df_dat row distinct, so duplicated vals inside a group
# are not collapsed and the result comes back in the original row order.
df_dat %>%
  mutate(row_id = row_number()) %>%
  left_join(df_lookup, by = c("ID", "order"), suffix = c(".dat", ".lkp")) %>%
  group_by(row_id, order, ID, val.dat) %>%
  summarise(new_ratio = interpolate_ratio(val.dat[1], val.lkp, ratio)) %>%
  ungroup() %>%
  select(order, ID, val = val.dat, new_ratio)
#> # A tibble: 9 x 4
#>   order ID       val new_ratio
#>   <dbl> <chr>  <dbl>     <dbl>
#> 1     1 batch1   0.1     0.2
#> 2     1 batch1  30       1.2
#> 3     1 batch1   2       0.243
#> 4     1 batch1  12       0.665
#> 5     2 batch1  45      NA
#> 6     2 batch2   1.5     0.1
#> 7     2 batch2  30       1.38
#> 8     2 batch2  13       0.65
#> 9     2 batch2  60       3.2
答案 1 :(得分:2)
使用data.table中的roll和rollends的一种方法:
# Slope of the line segment from each lookup row to the row after it
# (shift(x, -1L) reads the following row).
# NOTE(review): no `by` is given, so the slope on the last row of one ID/order
# group is computed against the first row of the next group -- confirm it is
# never used for an actual interpolation.
df_lookup[, m := (ratio - shift(ratio, -1L)) / (val - shift(val, -1L))]
# Rolling join on (order, ID, val): each df_dat row is matched to the lookup
# row whose val is the closest one at or below it. rollends = c(FALSE, FALSE)
# leaves values outside the lookup range unmatched (NA). The matched point and
# its slope give the linear interpolation.
df_dat[, new_ratio :=
df_lookup[.SD, on=.(order, ID, val), roll=Inf, rollends=c(FALSE, FALSE),
x.m * (i.val - x.val) + x.ratio]
]
#for val in df_dat that are more than those in df_lookup
# Only rows still NA are touched; rolling forward picks the last lookup ratio.
df_dat[is.na(new_ratio), new_ratio :=
df_lookup[copy(.SD), on=.(order, ID, val), roll=Inf, x.ratio]]
#for val in df_dat that are less than those in df_lookup
# Rolling backward picks the first lookup ratio; rows without any matching
# order/ID group stay NA.
df_dat[is.na(new_ratio), new_ratio :=
df_lookup[copy(.SD), on=.(order, ID, val), roll=-Inf, x.ratio]]
输出:
order ID val new_ratio
1: 1 batch1 0.1 0.2000000
2: 1 batch1 30.0 1.2000000
3: 1 batch1 2.0 0.2428571
4: 1 batch1 12.0 0.6647059
5: 2 batch1 45.0 NA
6: 2 batch2 1.5 0.1000000
7: 2 batch2 30.0 1.3750000
8: 2 batch2 13.0 0.6500000
9: 2 batch2 60.0 3.2000000
数据:
library(data.table)
# Same example data as in the question, parsed from inline text with fread().
df_lookup <- fread('ID, order, pct, val, ratio
"batch1", 1, 1, 1, 0.2
"batch1", 1, 10, 8, 0.5
"batch1", 1, 25, 25, 1.2
"batch2", 2, 1, 2, 0.1
"batch2", 2, 10, 15, 0.75
"batch2", 2, 25, 33, 1.5
"batch2", 2, 50, 55, 3.2')
# Observations to look up; the (order 2, batch1) row has no lookup group.
df_dat <- fread('order, ID, val
1, "batch1", 0.1
1, "batch1", 30
1, "batch1", 2
1, "batch1", 12
2, "batch1", 45
2, "batch2", 1.5
2, "batch2", 30
2, "batch2", 13
2, "batch2", 60')
最后两条语句也可以用非等值联接(non-equi join)代替:
# Rows still NA whose val lies above the lookup range: among the lookup rows of
# the same order/ID with a smaller val, take the ratio of the last one.
df_dat[is.na(new_ratio), new_ratio:=
df_lookup[copy(.SD), on=.(order, ID, val<val), x.ratio, mult="last"]]
# Rows still NA whose val lies below the lookup range: among the lookup rows of
# the same order/ID with a larger val, take the ratio of the first one.
df_dat[is.na(new_ratio), new_ratio:=
df_lookup[copy(.SD), on=.(order, ID, val>val), x.ratio, mult="first"]]
df_dat
答案 2 :(得分:2)
使用data.table。
我使用了很多中间步骤,这样您可以检查每一步的结果、逐步调整并了解发生了什么;因此实际代码可以大大缩短。
library(data.table)
# Convert both tables to data.tables by reference
setDT(df_dat); setDT(df_lookup)
# For every lookup row, store the next row's val/ratio within the same
# ID/order group, so each row describes one interval [val, val2).
# The last row of a group gets NA and therefore never matches below.
df_lookup[, `:=`( val2 = shift( val, type = "lead" ),
                  ratio2 = shift( ratio, type = "lead" ) ),
          by = .( ID, order ) ][]
# Non-equi update join: attach to each df_dat row the interval it falls in.
# The lower bound is inclusive (val >= val). With a strict `>` a value lying
# exactly on an interior lookup point matched no interval and was later
# snapped to the group's min or max ratio by the nearest-roll fallback.
df_dat[ df_lookup,
        `:=`( val_start = i.val,
              val_end = i.val2,
              ratio_start = i.ratio,
              ratio_end = i.ratio2 ),
        on = .( ID, order, val >= val, val < val2 ) ][]
# Interpolate new_ratio for values that fall within an interval of df_lookup;
# rows without a matched interval stay NA here.
df_dat[, new_ratio := ratio_start +
         ( (val - val_start) * (ratio_end - ratio_start) /
             (val_end - val_start) )][]
# Per ID/order group: the smallest/largest val and the smallest/largest ratio
df_lookup_min_max <- df_lookup[, .( val_min = min( val ),
                                    val_max = max( val ),
                                    ratio_min = min( ratio ),
                                    ratio_max = max( ratio ) ),
                               by = .(ID, order) ]
# Long format: one (val, ratio) row for the minimum and one for the maximum
df_lookup_min_max_melt <- melt( df_lookup_min_max,
                                id.vars = c( "ID", "order" ),
                                measure.vars = patterns( val = "^val",
                                                         ratio = "^ratio" ) )
# Rows still NA lie outside the lookup range (or on its upper end): rolling to
# the nearest of the two boundary vals yields the min or the max ratio.
# Rows whose ID/order pair is absent from df_lookup find no match and stay NA.
df_dat[ is.na( new_ratio ),
        new_ratio := df_lookup_min_max_melt[ df_dat[ is.na( new_ratio ), ],
                                             ratio,
                                             on = .(ID, order, val ),
                                             roll = "nearest" ] ][]
# Drop the helper columns created by the interval join
df_dat[, `:=`(val_start = NULL, val_end = NULL,
              ratio_start = NULL, ratio_end = NULL)][]
最终输出
# order ID val new_ratio
# 1: 1 batch1 0.1 0.2000000
# 2: 1 batch1 30.0 1.2000000
# 3: 1 batch1 2.0 0.2428571
# 4: 1 batch1 12.0 0.6647059
# 5: 2 batch1 45.0 NA
# 6: 2 batch2 1.5 0.1000000
# 7: 2 batch2 30.0 1.3750000
# 8: 2 batch2 13.0 0.6500000
# 9: 2 batch2 60.0 3.2000000
编辑
输出中的第5行(5: 2 batch1 45.0 NA)为NA,
是因为df_lookup中没有order == 2且ID == batch1的组合...
也许这是一个错字?
但是:代码似乎可以很好地处理;-)