使用dplyr将数据高效地划分到指定的bin中

时间:2019-03-26 22:06:59

标签: r dplyr

我有两个数据框:一个包含7个bin的数据框(称为FJX_bins),指定每个bin的上下限和编号;另一个是由波长-sigma对组成的数据框(test_spectra)。我想根据FJX_bins中的bin上下限,在test_spectra中创建一个名为bin_number的新变量。两者的dput输出附在下面。

使用mutate和case_when逐个bin硬写条件(暴力方法)相对容易,但这里的关键是我希望该解决方案可以扩展到任意数量的bin。我的感觉是,dplyr中可能有某种类似apply的方法可以在这里使用,而我想做的本质上就是一个for循环,如下所示:

# Attempted approach (does NOT work): try to generate the case_when()
# conditions with a for loop over the rows of FJX_bins. A `for` loop evaluates
# to NULL in R, so case_when() receives NULL instead of a set of two-sided
# formulas (`condition ~ value`) and raises the error quoted below.
df <- test_spectra %>%
  mutate(bin_number = case_when(
    for(ii in 1:nrow(FJX_bins)){
      Wavelength >= FJX_bins$Lambda_Start[ii] & Wavelength < FJX_bins$Lambda_End[ii] ~
        FJX_bins$Bin_Number[ii]}
    ))

此策略失败,并引发错误

  

情况1(`for (ii in 1:nrow(FJX_bins)) {...`)必须是双边公式(two-sided formula),不能为NULL

是否可以使用dplyr解决此问题?还是我需要退后一步,看看apply或cut之类的东西?由于其他原因,我更愿意坚持使用dplyr框架,但也可以接受该框架之外的方案。

谢谢

# dput() output of the bin definition table (a tibble with 7 rows).
# Columns: Bin_Number (integer id, 1-7), Lambda_Start and Lambda_End (the
# lower and upper wavelength limits of each bin). Each bin's Lambda_End equals
# the next bin's Lambda_Start, so the bins are contiguous from 289 to 850.
# The `spec` attribute is a col_spec that also lists an Effective_Lambda
# column which is not present in the data itself.
FJX_bins <- structure(list(Bin_Number = 1:7, Lambda_Start = c(289, 298.25, 
307.45, 312.45, 320.3, 345, 412.45), Lambda_End = c(298.25, 307.45, 
312.45, 320.3, 345, 412.45, 850)), row.names = c(NA, -7L), class = c("tbl_df", 
"tbl", "data.frame"), spec = structure(list(cols = structure(list(
    Bin_Number = structure(list(), class = c("collector_integer", 
    "collector")), Lambda_Start = structure(list(), class = c("collector_double", 
    "collector")), Lambda_End = structure(list(), class = c("collector_double", 
    "collector")), Effective_Lambda = structure(list(), class = c("collector_integer", 
    "collector"))), .Names = c("Bin_Number", "Lambda_Start", 
"Lambda_End", "Effective_Lambda")), default = structure(list(), class = c("collector_guess", 
"collector"))), .Names = c("cols", "default"), class = "col_spec"), .Names = c("Bin_Number", 
"Lambda_Start", "Lambda_End"))

# dput() output of the example spectra (a tibble with 48 rows).
# Columns: Wavelength (integer) and Sigma (double). The wavelengths 289-300
# are repeated four times, each repetition carrying a different Sigma series.
test_spectra <- structure(list(Wavelength = c(289L, 290L, 291L, 292L, 293L, 294L, 
295L, 296L, 297L, 298L, 299L, 300L, 289L, 290L, 291L, 292L, 293L, 
294L, 295L, 296L, 297L, 298L, 299L, 300L, 289L, 290L, 291L, 292L, 
293L, 294L, 295L, 296L, 297L, 298L, 299L, 300L, 289L, 290L, 291L, 
292L, 293L, 294L, 295L, 296L, 297L, 298L, 299L, 300L), Sigma = c(3.97790085259898e-20, 
3.88773011066234e-20, 3.77170497723194e-20, 3.63990173255768e-20, 
3.53611020195826e-20, 3.39379425027765e-20, 3.24540998352932e-20, 
3.08629426249589e-20, 2.93243925380076e-20, 2.80431593390348e-20, 
2.64345023340469e-20, 2.49597804268261e-20, 4.79587956800083e-20, 
4.67040607723134e-20, 4.5134283789068e-20, 4.32731814710643e-20, 
4.13196812361237e-20, 3.93856298421813e-20, 3.77050786831795e-20, 
3.62340670271797e-20, 3.49404344374885e-20, 3.36066462681245e-20, 
3.20871974271263e-20, 3.03438697547602e-20, 5.27803299371575e-20, 
5.12475486084599e-20, 4.99112054163632e-20, 4.86399784101602e-20, 
4.73236079731255e-20, 4.56798834656559e-20, 4.36887241590191e-20, 
4.13697643104457e-20, 3.89697643104457e-20, 3.66909671059429e-20, 
3.46634646072095e-20, 3.28648835305714e-20, 5.71590756444018e-20, 
5.57618648066173e-20, 5.44949261656802e-20, 5.33110977304272e-20, 
5.21177991137917e-20, 5.07478142704849e-20, 4.9100984463428e-20, 
4.70660943398542e-20, 4.47661068638463e-20, 4.24314737804269e-20, 
4.02176301884806e-20, 3.82570654305878e-20)), row.names = c(NA, 
-48L), class = c("tbl_df", "tbl", "data.frame"), .Names = c("Wavelength", 
"Sigma"))

3 个答案:

答案 0 :(得分:4)

fuzzyjoin实现了dplyr范围/间隔联接:

library(fuzzyjoin)

# Interval join: attach to every spectrum row the bin whose
# Lambda_Start..Lambda_End range contains its Wavelength.
# In `by`, the left-hand names are columns of the first table and the
# right-hand names are columns of the second table, so test_spectra (the table
# that owns `Wavelength`) must be passed first and FJX_bins second. This also
# matches the column order of the printed result (Wavelength, Sigma, then the
# bin columns).
# NOTE(review): the printed result has 52 rows for 48 input rows, so some
# wavelengths match more than one bin (presumably values on a bin boundary);
# de-duplicate if exactly one bin per row is required -- TODO confirm.
interval_left_join(
    test_spectra,
    FJX_bins,
    by = c('Wavelength' = 'Lambda_Start', 'Wavelength' = 'Lambda_End')
)
# A tibble: 52 x 5
   Wavelength    Sigma Bin_Number Lambda_Start Lambda_End
        <int>    <dbl>      <int>        <dbl>      <dbl>
 1        289 3.98e-20          1          289       298.
 2        290 3.89e-20          1          289       298.
 3        291 3.77e-20          1          289       298.
 4        292 3.64e-20          1          289       298.
 5        293 3.54e-20          1          289       298.
 6        294 3.39e-20          1          289       298.
 7        295 3.25e-20          1          289       298.
 8        296 3.09e-20          1          289       298.
 9        297 2.93e-20          1          289       298.
10        298 2.80e-20          1          289       298.
# … with 42 more rows

答案 1 :(得分:3)

如果您找不到dplyr解决方案:

library(data.table)

# Convert both tables to data.table by reference (no copy is made).
setDT(FJX_bins)
setDT(test_spectra)

# Non-equi update join: for every bin row in FJX_bins, find the spectra rows
# with Lambda_Start <= Wavelength < Lambda_End and write that bin's number
# into a new bin_number column of test_spectra, in place.
# The `i.` prefix refers to the column of the table given in `i` (FJX_bins).
test_spectra[
  FJX_bins,
  on = .(Wavelength >= Lambda_Start, Wavelength < Lambda_End),
  bin_number := i.Bin_Number
]

这将与bins表进行非等价联接,并相应地设置bin编号。

答案 2 :(得分:2)

使用dplyr:

将bin编号创建为因子(factor):

library(dplyr)

# Label each wavelength with its bin number, stored as a factor.
# The breaks are every bin's lower limit plus the upper limit of the last bin,
# read from FJX_bins instead of being hard-coded, so the code keeps working
# for any number of bins (assumes the bins are contiguous, as in the example).
# right = FALSE makes each interval left-closed, [start, end), matching the
# `>= Lambda_Start & < Lambda_End` rule from the question. A Wavelength equal
# to the final upper limit therefore gets NA.
Test_Spectra <- mutate(
  test_spectra,
  bin = cut(
    Wavelength,
    breaks = c(FJX_bins$Lambda_Start, max(FJX_bins$Lambda_End)),
    labels = FJX_bins$Bin_Number,
    right = FALSE
  )
)

或创建bin号作为字符变量

# Same binning as a character column instead of a factor.
# The breaks are every bin's lower limit plus the upper limit of the last bin,
# read from FJX_bins instead of being hard-coded, so the code keeps working
# for any number of bins (assumes the bins are contiguous, as in the example).
# right = FALSE makes each interval left-closed, [start, end), matching the
# `>= Lambda_Start & < Lambda_End` rule from the question.
# as.character() converts the factor returned by cut() to its labels.
Test_Spectra <- mutate(
  test_spectra,
  bin = as.character(
    cut(
      Wavelength,
      breaks = c(FJX_bins$Lambda_Start, max(FJX_bins$Lambda_End)),
      labels = FJX_bins$Bin_Number,
      right = FALSE
    )
  )
)