根据唯一条件从数据中选择行

时间:2018-11-13 11:55:29

标签: r dplyr data.table

# Example data for the question: a data.frame with 92 rows
# (row.names = c(NA, -92L)) and three numeric columns:
#   doy       - consecutive values 274 through 365 (presumably day of year)
#   no.plant  - 0/1 values
#   cum.value - values from 0 up to about 0.998; appears to be non-decreasing
#               in this sample (looks like a cumulative fraction)
dat <- structure(list(doy = c(274, 275, 276, 277, 278, 279, 280, 281, 282, 283, 284, 285, 286, 287, 288, 289, 290, 291, 292, 293, 294, 
                          295, 296, 297, 298, 299, 300, 301, 302, 303, 304, 305, 306, 307, 308, 309, 310, 311, 312, 313, 314, 315, 
                          316, 317, 318, 319, 320, 321, 322, 323, 324, 325, 326, 327, 328, 329, 330, 331, 332, 333, 334, 335, 336, 
                          337, 338, 339, 340, 341, 342, 343, 344, 345, 346, 347, 348, 349, 350, 351, 352, 353, 354, 355, 356, 357, 
                          358, 359, 360, 361, 362, 363, 364, 365), 
                  no.plant = c(0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 
                               0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 
                               0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1), 
                  cum.value = c(0, 1.34973713866726e-05, 0.000107973870218436, 0.000364365089792096, 0.000863464598244823, 0.00168576031682954, 
                                0.00291120609231443, 0.00291120609231443, 0.0046189294134239, 0.00688687680728461, 0.00688687680728461, 
                                0.00979139917551386, 0.0134067801825104, 0.0178047117788614, 0.0230537220148601, 0.0292185614529241, 
                                0.0292185614529241, 0.0363595556987137, 0.0363595556987137, 0.0445319328097977, 0.0537851355741434, 
                                0.0641621298405947, 0.0756987211882645, 0.0884228931969177, 0.102354181379628, 0.102354181379628, 0.117503097415405, 
                                0.133870618627253, 0.151447757647197, 0.151447757647197, 0.170215226855778, 0.170215226855778, 
                                0.190143211447851, 0.211191263836225, 0.233308330547831, 0.256432920794094, 0.280493423522773, 0.305408577012532, 
                                0.331088091999851, 0.357433425992349, 0.384338702900249, 0.411691768499651, 0.439375368630229, 0.467268433537531, 
                                0.495247448513112, 0.523187888081939, 0.550965688550059, 0.578458731861707, 0.605548312515632, 0.632120558828558, 
                                0.658067780159839, 0.683289712849355, 0.707694639565394, 0.731200359474982, 0.753734990069534, 0.753734990069534, 
                                0.753734990069534, 0.753734990069534, 0.775237585508182, 0.795658560857758, 0.814959916467899, 0.833115261761304, 
                                0.850109642771837, 0.865939182653005, 0.865939182653005, 0.880610548937487, 0.894140265397845, 0.906553889802375, 
                                0.917885081566473, 0.928174585188328, 0.93746915638157, 0.945820457966355, 0.95328395187962, 0.959917812174526, 
                                0.965781881688334, 0.970936692282333, 0.975442565331355, 0.97935880560985, 0.97935880560985, 0.982742998037354, 
                                0.985650413056059, 0.988133522855331, 0.990241627354782, 0.992020585910824, 0.993512648199701, 0.994756375705273, 
                                0.995786643728671, 0.996634712840931, 0.997328358197721, 0.997892045086969, 0.998347139430071, 0.998347139430071)), 
             class = "data.frame", row.names = c(NA, -92L))


  # Step size: the thresholds of interest are the multiples 1 * delta,
  # 2 * delta, ... of this value, compared against cum.value.
  delta <- 0.04991736

我需要选择 cum.value 达到 1*delta、2*delta、3*delta、4*delta …… n*delta 时对应的 doy;如果 n*delta 没有到达最后一个 doy(即 doy 365),还要把最后这个 doy 也包括进来。

此刻,我正在通过反复试验来选择 n,这是通过首先创建 n*delta 的序列来实现的。例如 1:19:

qt.vec.19 <- 1:19 * delta
max(qt.vec.19) >=  max(dat$cum.value)
FALSE

如果我将 1:n 更改为 1:20:

qt.vec.20 <- 1:20 * delta
max(qt.vec.20) >= max(dat$cum.value)
TRUE

这意味着我可以做 1 * delta,2 * delta .... 19 * delta,然后再选择最后一个 doy。

sample.dat <- dat %>% dplyr::slice(unique(c(which.max(cum.value > qt.vec.19[1]),
                                            which.max(cum.value > qt.vec.19[2]),
                                            which.max(cum.value > qt.vec.19[3]),
                                            which.max(cum.value > qt.vec.19[4]),
                                            which.max(cum.value > qt.vec.19[5]),
                                            which.max(cum.value > qt.vec.19[6]),
                                            which.max(cum.value > qt.vec.19[7]),
                                            which.max(cum.value > qt.vec.19[8]),
                                            which.max(cum.value > qt.vec.19[9]),
                                            which.max(cum.value > qt.vec.19[10]),
                                            which.max(cum.value > qt.vec.19[11]),
                                            which.max(cum.value > qt.vec.19[12]),
                                            which.max(cum.value > qt.vec.19[13]),
                                            which.max(cum.value > qt.vec.19[14]),
                                            which.max(cum.value > qt.vec.19[15]),
                                            which.max(cum.value > qt.vec.19[16]),
                                            which.max(cum.value > qt.vec.19[17]),
                                            which.max(cum.value > qt.vec.19[18]),
                                            which.max(cum.value > qt.vec.19[19]))))
last.doy <- dat %>% dplyr::filter(doy == 365)
all.doy <- as.data.frame(rbind(sample.dat, last.doy))

doy no.plant cum.value
294        0 0.05378514
298        0 0.10235418
302        0 0.15144776
307        0 0.21119126
309        0 0.25643292
311        0 0.30540858
313        0 0.35743343
315        0 0.41169177
317        0 0.46726843
319        0 0.52318789
320        0 0.55096569
322        0 0.60554831
324        0 0.65806778
326        0 0.70769464
328        0 0.75373499
334        0 0.81495992
336        0 0.85010964
341        0 0.90655389
346        0 0.95328395
365        1 0.99834714

我想知道是否有更好的方法来做到这一点,例如直接选出我的 n 值,或者避免使用上面针对 qt.vec 的那一长段代码?

2 个答案:

答案 0 :(得分:1)

这在一定程度上取决于个人偏好和使用场景。您可能读到过很多“在 R 中不提倡使用循环”的说法——但循环同样能得到结果、易于阅读,而且属于 Base R:不需要额外的程序包,也不必学习新的语法:

# Select the rows at which `cum.value` first reaches each successive multiple
# of `delta` (1 * delta, 2 * delta, ...), and always include the last row.
#
# Arguments:
#   dat   - data.frame with a numeric column named "cum.value"; rows are
#           scanned in their existing order.
#   delta - single positive number; the spacing between thresholds.
#
# Returns the selected rows of `dat`, in their original order and with their
# original row names. Each row appears at most once, even when it passes
# several thresholds at once or when the last row was already selected.
select_threshold_rows <- function(dat, delta) {
  stopifnot(is.numeric(delta), length(delta) == 1L, delta > 0)

  values <- dat[["cum.value"]]          # hoisted: read the column only once
  stopifnot(all(is.finite(values)))     # NA/Inf would break the comparisons

  n_rows <- nrow(dat)
  keep <- logical(n_rows)               # preallocated selection mask
  n <- 1                                # multiplier of the next threshold

  for (i in seq_len(n_rows)) {          # seq_len() is safe for zero rows
    if (values[i] >= n * delta) {       # as soon as the threshold is passed
      keep[i] <- TRUE
      # A single row may jump past several thresholds; skip all of them so
      # later rows are compared against the first threshold not yet reached.
      while (values[i] >= n * delta) {
        n <- n + 1
      }
    }
  }

  if (n_rows > 0L) {
    keep[n_rows] <- TRUE                # add the last row anyway (no duplicate)
  }

  dat[keep, , drop = FALSE]
}

options(scipen = 10, digits = 15)       # display all digits
dat <- read.csv("crop89.csv")           # load your data from a file
delta <- 0.04991736                     # selected threshold
all.doy <- select_threshold_rows(dat, delta)

all.doy
#>    doy no.plant          cum.value
#> 21 294        0 0.0537851355741434
#> 25 298        0 0.1023541813796280
#> 29 302        0 0.1514477576471970
#> 34 307        0 0.2111912638362250
#> 36 309        0 0.2564329207940940
#> 38 311        0 0.3054085770125320
#> 40 313        0 0.3574334259923490
#> 42 315        0 0.4116917684996510
#> 44 317        0 0.4672684335375310
#> 46 319        0 0.5231878880819389
#> 47 320        0 0.5509656885500590
#> 49 322        0 0.6055483125156320
#> 51 324        0 0.6580677801598390
#> 53 326        0 0.7076946395653940
#> 55 328        0 0.7537349900695340
#> 61 334        0 0.8149599164678990
#> 63 336        0 0.8501096427718370
#> 68 341        0 0.9065538898023749
#> 73 346        0 0.9532839518796200
#> 92 365        1 0.9983471394300710

答案 1 :(得分:0)

要点是此处的cut函数:

library(data.table)

# `dat` (the example data.frame) and `qt.vec.19` (the threshold vector
# 1:19 * delta) must already exist; both come from the question's code.
DT <- as.data.table(dat)

# Label every row with the number of thresholds it has strictly exceeded.
# cut() uses right-closed intervals by default, so group k holds the rows with
# cum.value in (qt.vec.19[k], qt.vec.19[k + 1]]; group 0 is at or below the
# first threshold.
DT[, group := as.numeric(cut(cum.value, c(-Inf, qt.vec.19, Inf),
                             ordered_result = TRUE)) - 1]

# Within each group, rank rows by cum.value; ties keep their order of
# appearance, so position 1 is the first row that exceeded the threshold.
DT[, position := frank(cum.value, ties.method = "first"), by = group]

# Keep one row per exceeded threshold and drop the helper columns.
DT <- DT[position == 1 & group > 0]
DT[, c("position", "group") := NULL]

# Append the row(s) of the last doy unless already selected. Checking doy
# avoids comparing floating-point maxima, also works when DT is empty, and is
# not fooled by an earlier row that ties with the final cum.value.
if (!(max(dat$doy) %in% DT$doy)) {
  DT <- rbind(DT, dat[dat$doy == max(dat$doy), ])
}